From 2b84be544e4a27f7e8e80827e9c85c8f0d58b4ce Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 8 Apr 2020 10:15:51 +0100 Subject: COMPMID-3280: Make all ML primitives for CL use the new interface - Part 2 - CLFunctions have been updated Change-Id: Ie3256a6c775bc12f3126482bd8e8a46da54b267c Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3053 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- .../runtime/CL/functions/CLAbsoluteDifference.h | 10 +- arm_compute/runtime/CL/functions/CLAccumulate.h | 25 +- .../runtime/CL/functions/CLActivationLayer.h | 11 + .../runtime/CL/functions/CLArgMinMaxLayer.h | 9 + .../CL/functions/CLBatchNormalizationLayer.h | 21 +- .../runtime/CL/functions/CLBatchToSpaceLayer.h | 17 ++ arm_compute/runtime/CL/functions/CLBitwiseAnd.h | 10 +- arm_compute/runtime/CL/functions/CLBitwiseNot.h | 9 +- arm_compute/runtime/CL/functions/CLBitwiseOr.h | 10 +- arm_compute/runtime/CL/functions/CLBitwiseXor.h | 10 +- .../runtime/CL/functions/CLBoundingBoxTransform.h | 14 +- arm_compute/runtime/CL/functions/CLBox3x3.h | 11 +- arm_compute/runtime/CL/functions/CLCannyEdge.h | 16 +- arm_compute/runtime/CL/functions/CLCast.h | 22 +- .../runtime/CL/functions/CLChannelCombine.h | 21 +- .../runtime/CL/functions/CLChannelExtract.h | 18 +- .../runtime/CL/functions/CLChannelShuffleLayer.h | 8 + arm_compute/runtime/CL/functions/CLColorConvert.h | 32 ++- arm_compute/runtime/CL/functions/CLComparison.h | 24 ++ .../runtime/CL/functions/CLComputeAllAnchors.h | 11 +- .../runtime/CL/functions/CLConcatenateLayer.h | 14 +- .../CL/functions/CLConvertFullyConnectedWeights.h | 24 +- arm_compute/runtime/CL/functions/CLConvolution.h | 38 ++- .../runtime/CL/functions/CLConvolutionLayer.h | 23 ++ arm_compute/runtime/CL/functions/CLCopy.h | 10 +- arm_compute/runtime/CL/functions/CLCropResize.h | 28 +- .../runtime/CL/functions/CLDeconvolutionLayer.h | 13 + 
.../CL/functions/CLDeconvolutionLayerUpsample.h | 8 + .../runtime/CL/functions/CLDepthConvertLayer.h | 23 +- .../runtime/CL/functions/CLDepthToSpaceLayer.h | 8 + .../CL/functions/CLDepthwiseConvolutionLayer.h | 65 +++++ .../runtime/CL/functions/CLDequantizationLayer.h | 8 + arm_compute/runtime/CL/functions/CLDerivative.h | 15 +- arm_compute/runtime/CL/functions/CLDilate.h | 11 +- .../CL/functions/CLDirectConvolutionLayer.h | 16 ++ .../CL/functions/CLDirectDeconvolutionLayer.h | 15 ++ .../runtime/CL/functions/CLElementWiseUnaryLayer.h | 51 +++- .../runtime/CL/functions/CLElementwiseOperations.h | 79 ++++++ .../runtime/CL/functions/CLEqualizeHistogram.h | 9 +- arm_compute/runtime/CL/functions/CLErode.h | 11 +- arm_compute/runtime/CL/functions/CLFFT1D.h | 10 +- arm_compute/runtime/CL/functions/CLFFT2D.h | 10 +- .../runtime/CL/functions/CLFFTConvolutionLayer.h | 19 +- arm_compute/runtime/CL/functions/CLFastCorners.h | 15 +- arm_compute/runtime/CL/functions/CLFill.h | 9 +- arm_compute/runtime/CL/functions/CLFillBorder.h | 11 +- arm_compute/runtime/CL/functions/CLFlattenLayer.h | 9 + arm_compute/runtime/CL/functions/CLFloor.h | 2 +- .../runtime/CL/functions/CLFullyConnectedLayer.h | 41 ++- .../CL/functions/CLFuseBatchNormalization.h | 21 +- arm_compute/runtime/CL/functions/CLGEMM.h | 42 ++- .../runtime/CL/functions/CLGEMMConvolutionLayer.h | 51 +++- .../CL/functions/CLGEMMDeconvolutionLayer.h | 11 + .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 19 ++ .../runtime/CL/functions/CLGEMMLowpOutputStage.h | 93 +++++++ arm_compute/runtime/CL/functions/CLGather.h | 9 + arm_compute/runtime/CL/functions/CLGaussian3x3.h | 11 +- arm_compute/runtime/CL/functions/CLGaussian5x5.h | 11 +- .../runtime/CL/functions/CLGaussianPyramid.h | 14 +- .../CL/functions/CLGenerateProposalsLayer.h | 18 ++ arm_compute/runtime/CL/functions/CLHOGDescriptor.h | 13 +- arm_compute/runtime/CL/functions/CLHOGDetector.h | 18 +- arm_compute/runtime/CL/functions/CLHOGGradient.h | 15 +- 
.../runtime/CL/functions/CLHOGMultiDetection.h | 27 +- arm_compute/runtime/CL/functions/CLHarrisCorners.h | 19 +- arm_compute/runtime/CL/functions/CLHistogram.h | 9 +- .../CL/functions/CLInstanceNormalizationLayer.h | 12 + arm_compute/runtime/CL/functions/CLIntegralImage.h | 9 +- .../runtime/CL/functions/CLL2NormalizeLayer.h | 11 +- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 46 ++++ .../runtime/CL/functions/CLLSTMLayerQuantized.h | 29 ++- .../runtime/CL/functions/CLLaplacianPyramid.h | 15 +- .../runtime/CL/functions/CLLaplacianReconstruct.h | 18 +- .../runtime/CL/functions/CLLocallyConnectedLayer.h | 15 +- arm_compute/runtime/CL/functions/CLMagnitude.h | 11 +- arm_compute/runtime/CL/functions/CLMeanStdDev.h | 10 +- .../CL/functions/CLMeanStdDevNormalizationLayer.h | 12 +- arm_compute/runtime/CL/functions/CLMedian3x3.h | 11 +- .../runtime/CL/functions/CLMinMaxLocation.h | 18 +- .../runtime/CL/functions/CLNonLinearFilter.h | 16 +- .../CL/functions/CLNonMaximaSuppression3x3.h | 14 +- .../runtime/CL/functions/CLNormalizationLayer.h | 13 +- .../CL/functions/CLNormalizePlanarYUVLayer.h | 11 + arm_compute/runtime/CL/functions/CLOpticalFlow.h | 23 +- arm_compute/runtime/CL/functions/CLPReluLayer.h | 10 + arm_compute/runtime/CL/functions/CLPadLayer.h | 13 + arm_compute/runtime/CL/functions/CLPermute.h | 10 + arm_compute/runtime/CL/functions/CLPhase.h | 11 +- .../CL/functions/CLPixelWiseMultiplication.h | 27 ++ arm_compute/runtime/CL/functions/CLPoolingLayer.h | 9 + arm_compute/runtime/CL/functions/CLPriorBoxLayer.h | 11 +- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 72 +++++- .../runtime/CL/functions/CLQuantizationLayer.h | 9 + arm_compute/runtime/CL/functions/CLRNNLayer.h | 15 +- arm_compute/runtime/CL/functions/CLROIAlignLayer.h | 16 ++ .../runtime/CL/functions/CLROIPoolingLayer.h | 17 +- arm_compute/runtime/CL/functions/CLRange.h | 11 +- arm_compute/runtime/CL/functions/CLReduceMean.h | 11 + .../runtime/CL/functions/CLReductionOperation.h | 10 + 
arm_compute/runtime/CL/functions/CLRemap.h | 16 +- arm_compute/runtime/CL/functions/CLReorgLayer.h | 12 + arm_compute/runtime/CL/functions/CLReshapeLayer.h | 7 + arm_compute/runtime/CL/functions/CLReverse.h | 8 + arm_compute/runtime/CL/functions/CLScale.h | 15 ++ arm_compute/runtime/CL/functions/CLScharr3x3.h | 14 +- arm_compute/runtime/CL/functions/CLSelect.h | 9 + arm_compute/runtime/CL/functions/CLSlice.h | 14 + arm_compute/runtime/CL/functions/CLSobel3x3.h | 14 +- arm_compute/runtime/CL/functions/CLSobel5x5.h | 14 +- arm_compute/runtime/CL/functions/CLSobel7x7.h | 14 +- arm_compute/runtime/CL/functions/CLSoftmaxLayer.h | 28 +- .../runtime/CL/functions/CLSpaceToBatchLayer.h | 21 ++ .../runtime/CL/functions/CLSpaceToDepthLayer.h | 8 + arm_compute/runtime/CL/functions/CLStackLayer.h | 11 + arm_compute/runtime/CL/functions/CLStridedSlice.h | 18 ++ arm_compute/runtime/CL/functions/CLTableLookup.h | 10 +- arm_compute/runtime/CL/functions/CLThreshold.h | 16 +- arm_compute/runtime/CL/functions/CLTile.h | 8 + arm_compute/runtime/CL/functions/CLTranspose.h | 7 + arm_compute/runtime/CL/functions/CLUnstack.h | 12 +- arm_compute/runtime/CL/functions/CLUpsampleLayer.h | 10 + arm_compute/runtime/CL/functions/CLWarpAffine.h | 15 +- .../runtime/CL/functions/CLWarpPerspective.h | 14 +- .../CL/functions/CLWinogradConvolutionLayer.h | 22 +- .../CL/functions/CLWinogradInputTransform.h | 21 +- arm_compute/runtime/CL/functions/CLYOLOLayer.h | 14 +- src/runtime/CL/functions/CLAbsoluteDifference.cpp | 7 +- src/runtime/CL/functions/CLAccumulate.cpp | 21 +- src/runtime/CL/functions/CLActivationLayer.cpp | 7 +- src/runtime/CL/functions/CLArgMinMaxLayer.cpp | 17 +- .../CL/functions/CLBatchNormalizationLayer.cpp | 11 +- src/runtime/CL/functions/CLBatchToSpaceLayer.cpp | 16 +- src/runtime/CL/functions/CLBitwiseAnd.cpp | 7 +- src/runtime/CL/functions/CLBitwiseNot.cpp | 7 +- src/runtime/CL/functions/CLBitwiseOr.cpp | 7 +- src/runtime/CL/functions/CLBitwiseXor.cpp | 7 +- 
.../CL/functions/CLBoundingBoxTransform.cpp | 7 +- src/runtime/CL/functions/CLBox3x3.cpp | 9 +- src/runtime/CL/functions/CLCannyEdge.cpp | 21 +- src/runtime/CL/functions/CLCast.cpp | 7 +- src/runtime/CL/functions/CLChannelCombine.cpp | 14 +- src/runtime/CL/functions/CLChannelExtract.cpp | 14 +- src/runtime/CL/functions/CLChannelShuffleLayer.cpp | 7 +- src/runtime/CL/functions/CLColorConvert.cpp | 28 +- src/runtime/CL/functions/CLComparison.cpp | 19 +- src/runtime/CL/functions/CLComputeAllAnchors.cpp | 7 +- src/runtime/CL/functions/CLConcatenateLayer.cpp | 28 +- .../functions/CLConvertFullyConnectedWeights.cpp | 8 +- src/runtime/CL/functions/CLConvolution.cpp | 37 ++- src/runtime/CL/functions/CLConvolutionLayer.cpp | 17 +- src/runtime/CL/functions/CLCopy.cpp | 7 +- src/runtime/CL/functions/CLCropResize.cpp | 282 +++++++++++---------- src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 12 +- .../CL/functions/CLDeconvolutionLayerUpsample.cpp | 11 +- src/runtime/CL/functions/CLDepthConvertLayer.cpp | 7 +- src/runtime/CL/functions/CLDepthToSpaceLayer.cpp | 7 +- .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 55 +++- src/runtime/CL/functions/CLDequantizationLayer.cpp | 7 +- src/runtime/CL/functions/CLDerivative.cpp | 9 +- src/runtime/CL/functions/CLDilate.cpp | 9 +- .../CL/functions/CLDirectConvolutionLayer.cpp | 13 +- .../CL/functions/CLDirectDeconvolutionLayer.cpp | 30 ++- .../CL/functions/CLElementWiseUnaryLayer.cpp | 49 +++- .../CL/functions/CLElementwiseOperations.cpp | 67 +++-- src/runtime/CL/functions/CLEqualizeHistogram.cpp | 13 +- src/runtime/CL/functions/CLErode.cpp | 9 +- src/runtime/CL/functions/CLFFT1D.cpp | 13 +- src/runtime/CL/functions/CLFFT2D.cpp | 9 +- src/runtime/CL/functions/CLFFTConvolutionLayer.cpp | 38 +-- src/runtime/CL/functions/CLFastCorners.cpp | 16 +- src/runtime/CL/functions/CLFill.cpp | 10 +- src/runtime/CL/functions/CLFillBorder.cpp | 7 +- src/runtime/CL/functions/CLFlattenLayer.cpp | 7 +- src/runtime/CL/functions/CLFloor.cpp | 2 +- 
src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 44 ++-- .../CL/functions/CLFuseBatchNormalization.cpp | 12 +- src/runtime/CL/functions/CLGEMM.cpp | 49 ++-- .../CL/functions/CLGEMMConvolutionLayer.cpp | 37 ++- .../CL/functions/CLGEMMDeconvolutionLayer.cpp | 70 ++--- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 30 ++- src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 77 +++++- src/runtime/CL/functions/CLGather.cpp | 7 +- src/runtime/CL/functions/CLGaussian3x3.cpp | 9 +- src/runtime/CL/functions/CLGaussian5x5.cpp | 13 +- src/runtime/CL/functions/CLGaussianPyramid.cpp | 22 +- .../CL/functions/CLGenerateProposalsLayer.cpp | 31 ++- src/runtime/CL/functions/CLHOGDescriptor.cpp | 13 +- src/runtime/CL/functions/CLHOGDetector.cpp | 10 +- src/runtime/CL/functions/CLHOGGradient.cpp | 14 +- src/runtime/CL/functions/CLHOGMultiDetection.cpp | 16 +- src/runtime/CL/functions/CLHarrisCorners.cpp | 21 +- src/runtime/CL/functions/CLHistogram.cpp | 11 +- .../CL/functions/CLInstanceNormalizationLayer.cpp | 7 +- src/runtime/CL/functions/CLIntegralImage.cpp | 11 +- src/runtime/CL/functions/CLL2NormalizeLayer.cpp | 9 +- src/runtime/CL/functions/CLLSTMLayer.cpp | 109 ++++---- src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 68 +++-- src/runtime/CL/functions/CLLaplacianPyramid.cpp | 13 +- .../CL/functions/CLLaplacianReconstruct.cpp | 13 +- .../CL/functions/CLLocallyConnectedLayer.cpp | 16 +- src/runtime/CL/functions/CLMagnitude.cpp | 7 +- src/runtime/CL/functions/CLMeanStdDev.cpp | 15 +- .../functions/CLMeanStdDevNormalizationLayer.cpp | 7 +- src/runtime/CL/functions/CLMedian3x3.cpp | 9 +- src/runtime/CL/functions/CLMinMaxLocation.cpp | 13 +- src/runtime/CL/functions/CLNonLinearFilter.cpp | 10 +- .../CL/functions/CLNonMaximaSuppression3x3.cpp | 11 +- src/runtime/CL/functions/CLNormalizationLayer.cpp | 11 +- .../CL/functions/CLNormalizePlanarYUVLayer.cpp | 7 +- src/runtime/CL/functions/CLOpticalFlow.cpp | 19 +- src/runtime/CL/functions/CLPReluLayer.cpp | 13 +- 
src/runtime/CL/functions/CLPadLayer.cpp | 11 +- src/runtime/CL/functions/CLPermute.cpp | 7 +- src/runtime/CL/functions/CLPhase.cpp | 7 +- .../CL/functions/CLPixelWiseMultiplication.cpp | 19 +- src/runtime/CL/functions/CLPoolingLayer.cpp | 9 +- src/runtime/CL/functions/CLPriorBoxLayer.cpp | 9 +- src/runtime/CL/functions/CLQLSTMLayer.cpp | 132 ++++++---- src/runtime/CL/functions/CLQuantizationLayer.cpp | 7 +- src/runtime/CL/functions/CLRNNLayer.cpp | 17 +- src/runtime/CL/functions/CLROIAlignLayer.cpp | 7 +- src/runtime/CL/functions/CLROIPoolingLayer.cpp | 7 +- src/runtime/CL/functions/CLRange.cpp | 7 +- src/runtime/CL/functions/CLReduceMean.cpp | 11 +- src/runtime/CL/functions/CLReductionOperation.cpp | 21 +- src/runtime/CL/functions/CLRemap.cpp | 11 +- src/runtime/CL/functions/CLReorgLayer.cpp | 7 +- src/runtime/CL/functions/CLReshapeLayer.cpp | 7 +- src/runtime/CL/functions/CLReverse.cpp | 7 +- src/runtime/CL/functions/CLScale.cpp | 10 +- src/runtime/CL/functions/CLScharr3x3.cpp | 9 +- src/runtime/CL/functions/CLSelect.cpp | 9 +- src/runtime/CL/functions/CLSlice.cpp | 7 +- src/runtime/CL/functions/CLSobel3x3.cpp | 9 +- src/runtime/CL/functions/CLSobel5x5.cpp | 21 +- src/runtime/CL/functions/CLSobel7x7.cpp | 21 +- src/runtime/CL/functions/CLSoftmaxLayer.cpp | 28 +- src/runtime/CL/functions/CLSpaceToBatchLayer.cpp | 21 +- src/runtime/CL/functions/CLSpaceToDepthLayer.cpp | 9 +- src/runtime/CL/functions/CLStackLayer.cpp | 7 +- src/runtime/CL/functions/CLStridedSlice.cpp | 9 +- src/runtime/CL/functions/CLTableLookup.cpp | 7 +- src/runtime/CL/functions/CLThreshold.cpp | 8 +- src/runtime/CL/functions/CLTile.cpp | 7 +- src/runtime/CL/functions/CLTranspose.cpp | 7 +- src/runtime/CL/functions/CLUnstack.cpp | 9 +- src/runtime/CL/functions/CLUpsampleLayer.cpp | 10 +- src/runtime/CL/functions/CLWarpAffine.cpp | 10 +- src/runtime/CL/functions/CLWarpPerspective.cpp | 10 +- .../CL/functions/CLWinogradConvolutionLayer.cpp | 20 +- .../CL/functions/CLWinogradInputTransform.cpp | 9 +- 
src/runtime/CL/functions/CLYOLOLayer.cpp | 7 +- tests/validation/fixtures/CropResizeFixture.h | 15 +- 253 files changed, 3822 insertions(+), 856 deletions(-) diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h index 28d3acc043..26aded6def 100644 --- a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h +++ b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ public: * @param[out] output Output tensor. Data types supported: U8, S16 */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 First input tensor. Data types supported: U8, S16 + * @param[in] input2 Second input tensor. Data types supported: U8, S16 + * @param[out] output Output tensor. Data types supported: U8, S16 + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); }; } #endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H */ diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h index f465ab3c46..b47f0c0e4a 100644 --- a/arm_compute/runtime/CL/functions/CLAccumulate.h +++ b/arm_compute/runtime/CL/functions/CLAccumulate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,13 @@ public: * @param[out] accum Destination tensor. Data types supported: S16. */ void configure(const ICLTensor *input, ICLTensor *accum); + /** Set the input and accumulation tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: U8. 
+ * @param[out] accum Destination tensor. Data types supported: S16. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum); }; /** Basic function to run @ref CLAccumulateWeightedKernel */ @@ -55,6 +62,14 @@ public: * @param[in,out] accum Accumulated tensor. Data types supported: U8. */ void configure(const ICLTensor *input, float alpha, ICLTensor *accum); + /** Set the input and accumulation tensors, and the scale value. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] alpha The input scalar value with a value in the range of [0, 1.0]. Data types supported: F32. + * @param[in,out] accum Accumulated tensor. Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum); }; /** Basic function to run @ref CLAccumulateSquaredKernel */ @@ -68,6 +83,14 @@ public: * @param[in,out] accum Accumulated tensor. Data types supported: S16. */ void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum); + /** Set the input and accumulation tensors and the shift value. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] shift The input with a value in the range of [0, 15]. Data types supported: U32. + * @param[in,out] accum Accumulated tensor. Data types supported: S16. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum); }; } #endif /*ARM_COMPUTE_CLACCUMULATE_H */ diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index 09f5d2bf58..fbb34e5fb9 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -62,6 +62,17 @@ public: * @param[in] act_info Activation layer parameters. */ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer parameters. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h index a26fcfda56..b0d29bcefe 100644 --- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h @@ -61,6 +61,15 @@ public: * @param[in] op Reduction operation to perform. 
Operations supported: ARG_IDX_MAX, ARG_IDX_MIN */ void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[out] output Output source tensor. Data types supported: U32/S32. + * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op); /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer * * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h index 7cd4d164d8..a211ea6b04 100644 --- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -61,6 +61,25 @@ public: */ void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + /** Set the input and output tensors. + * + * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. 
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input + * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input + * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. 
diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h index b98702819b..6edb4641fe 100644 --- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h @@ -46,6 +46,14 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. + * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 + * @param[out] output Tensor output. Data types supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); /** Set the input and output tensors. (Static block shape). * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -54,6 +62,15 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output); + /** Set the input and output tensors. (Static block shape). + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[out] output Tensor output. 
Data types supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h index 77907cc08b..1faded04fe 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ public: * @param[out] output Output tensor. Data types supported: U8. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 Input tensor. Data types supported: U8. + * @param[in] input2 Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); }; } #endif /* ARM_COMPUTE_CLBITWISEAND_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h index b5c7cfe5fc..c9460555dd 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,13 @@ public: * @param[out] output Output tensor. Data types supported: U8. 
*/ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); }; } #endif /* ARM_COMPUTE_CLBITWISENOT_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h index 5957c3f6a8..4fb93cc8a2 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ public: * @param[out] output Output tensor. Data types supported: U8. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 Input tensor. Data types supported: U8. + * @param[in] input2 Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); }; } #endif /* ARM_COMPUTE_CLBITWISEOR_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h index a4e864c0aa..6caa013607 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ public: * @param[out] output Output tensor. Data types supported: U8. 
*/ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 Input tensor. Data types supported: U8. + * @param[in] input2 Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); }; } #endif /* ARM_COMPUTE_CLBITWISEXOR_H */ diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h index 3e11781827..b09359dfc2 100644 --- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h +++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,18 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. */ void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32. + * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input + * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes. 
+ * Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input + * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo. + * + * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h index 3fb18e3270..a4cf4d296b 100644 --- a/arm_compute/runtime/CL/functions/CLBox3x3.h +++ b/arm_compute/runtime/CL/functions/CLBox3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLBOX3X3_H */ diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h index 1a5676795f..2729d241a9 100644 --- a/arm_compute/runtime/CL/functions/CLCannyEdge.h +++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -69,6 +69,20 @@ public: */ void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] upper_thr Upper threshold used for the hysteresis. + * @param[in] lower_thr Lower threshold used for the hysteresis. + * @param[in] gradient_size Gradient size (3, 5 or 7). + * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, + uint8_t constant_border_value = 0); // Inherited methods overridden: virtual void run() override; diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h index 4cb1fe0bb5..6a1835c73a 100644 --- a/arm_compute/runtime/CL/functions/CLCast.h +++ b/arm_compute/runtime/CL/functions/CLCast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,26 @@ public: * @param[in] policy Conversion policy. */ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); + /** Initialize the function's source, destination + * + * Input data type must be different than output data type. + * + * Valid conversions Input -> Output : + * + * - U8 -> S8, U16, S16, U32, S32, F16, F32 + * - U16 -> U8, S8, S16, U32, S32, F16, F32 + * - S16 -> U8, S8, U16, U32, S32, F16, F32 + * - U32 -> U8, S8, U16, S16, S32, F16, F32 + * - S32 -> U8, S8, U16, S16, U32, F16, F32 + * - F16 -> U8, S8, U16, S16, U32, F32 + * - F32 -> U8, S8, U16, S16, U32, F16 + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref CLCast * * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. 
diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h index 25f31d86d1..474830d7af 100644 --- a/arm_compute/runtime/CL/functions/CLChannelCombine.h +++ b/arm_compute/runtime/CL/functions/CLChannelCombine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,16 @@ public: * @param[out] output The single planar output tensor. */ void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output); + /** Initialize function's inputs and outputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. + * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. + * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. + * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format. + * @param[out] output The single planar output tensor. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output); /** Initialize function's inputs and outputs. * * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. @@ -53,6 +63,15 @@ public: * @param[out] output The multi planar output image. */ void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output); + /** Initialize function's inputs and outputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. + * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. + * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. 
+ * @param[out] output The multi planar output image. + */ + void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output); }; } #endif /*ARM_COMPUTE_CLCHANNELCOMBINE_H*/ diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h index 77d84b968c..aa25516e18 100644 --- a/arm_compute/runtime/CL/functions/CLChannelExtract.h +++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,14 @@ public: * @param[out] output The extracted channel. Must be of U8 format. */ void configure(const ICLTensor *input, Channel channel, ICLTensor *output); + /** Initialize the function's source, destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 + * @param[in] channel The channel to extract. + * @param[out] output The extracted channel. Must be of U8 format. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output); /** Initialize the function's source, destination * * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444 @@ -51,6 +59,14 @@ public: * @param[out] output The extracted 2D channel. Must be of U8 format. */ void configure(const ICLMultiImage *input, Channel channel, ICLImage *output); + /** Initialize the function's source, destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444 + * @param[in] channel The channel to extract. + * @param[out] output The extracted 2D channel. 
Must be of U8 format. + */ + void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output); }; } #endif /*ARM_COMPUTE_CLCHANNELEXTRACT_H*/ diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h index 6e30bd3ebd..183a2f1ea6 100644 --- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h +++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h @@ -46,6 +46,14 @@ public: * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. */ void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * * @param[in] input Input tensor info. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h index 1a3bea9cd3..8721e8afa1 100644 --- a/arm_compute/runtime/CL/functions/CLColorConvert.h +++ b/arm_compute/runtime/CL/functions/CLColorConvert.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -47,24 +47,54 @@ public: * U8 (if the formats of @p input is RGB888) */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function's source, destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888 + * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422), + * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888), + * U8 (if the format of @p input is RGB888) + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Initialize the function's source, destination * * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888 */ void configure(const ICLMultiImage *input, ICLImage *output); + /** Initialize the function's source, destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888 + */ + void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output); /** Initialize the function's source, destination * * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888) */ void configure(const ICLImage *input, ICLMultiImage *output); + /** Initialize the function's source, destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Single-planar source image.
Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 + * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888) + */ + void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output); /** Initialize the function's source, destination * * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) */ void configure(const ICLMultiImage *input, ICLMultiImage *output); + /** Initialize the function's source, destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) + */ + void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output); }; } #endif /* ARM_COMPUTE_CLCOLORCONVERT_H */ diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h index 85dbe7129d..4e681e73a7 100644 --- a/arm_compute/runtime/CL/functions/CLComparison.h +++ b/arm_compute/runtime/CL/functions/CLComparison.h @@ -46,6 +46,17 @@ public: * @param[out] operation Comparison operation to be used. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); + /** Initialise the kernel's inputs and outputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 Source tensor. Data types supported: All. + * The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 Source tensor. Data types supported: Same as @p input1. + * The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] operation Comparison operation to be used. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparison * * @param[in] input1 Source tensor. Data types supported: All. @@ -75,6 +86,19 @@ public: * @param[out] output Destination tensor. Data types supported: U8. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); + /** Comparison operations used by the class */ + +public: + /** Initialise the kernel's inputs and outputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Source tensor. Data types supported: Same as @p input1. + * The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Destination tensor. Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLComparison * * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h index a039320c4e..15c5bfeb7d 100644 --- a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h +++ b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,15 @@ public: * */ void configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] anchors Source tensor. Original set of anchors of size (4, A) where A is the number of anchors. Data types supported: F16/F32 + * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors + * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo + * + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel * diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h index c3d065a2ba..b8e3361e9e 100644 --- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h +++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h @@ -62,6 +62,18 @@ public: */ void configure(std::vector &inputs_vector, ICLTensor *output, size_t axis); void configure(std::vector &inputs_vector, ICLTensor *output, size_t axis); + /** Initialise the kernel's inputs vector and output. + * + * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All. + * @param[out] output Output tensor. Data types supported: Same as @p input. + * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. + */ + void configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis); + void configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis); /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer * * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. @@ -81,7 +93,7 @@ public: private: template - void configure_internal(std::vector &&inputs_vector, ICLTensor *output, size_t axis); + void configure_internal(const CLCompileContext &compile_context, std::vector &&inputs_vector, ICLTensor *output, size_t axis); template static Status validate_internal(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h index 76a28ed6fe..123f6380bb 100644 --- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h @@ -47,6 +47,17 @@ public: * @return A status */ void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); + /** Initialize the function. + * + * @param[in] compile_context The compile context to be used. 
+ * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All. + * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. + * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. + * + * @return A status + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeights * * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. @@ -96,7 +107,18 @@ public: */ void configure(const ICLTensor *input, const TensorShape &original_input_shape, DataLayout data_layout) { - _func.configure(input, &_output, original_input_shape, data_layout); + configure(CLKernelLibrary::get().get_compile_context(), input, original_input_shape, data_layout); + } + /** Configures the @ref CLConvertFullyConnectedWeights function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source weights tensor info to convert. Data type supported: All. + * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const TensorShape &original_input_shape, DataLayout data_layout) + { + _func.configure(compile_context, input, &_output, original_input_shape, data_layout); } private: diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h index 43507d7cbc..72ef8ce7b8 100644 --- a/arm_compute/runtime/CL/functions/CLConvolution.h +++ b/arm_compute/runtime/CL/functions/CLConvolution.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,17 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8 or S16. + * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); }; /** Basic function to execute square convolution.Currently it supports 5x5, 7x7, 9x9. This function calls the following OpenCL kernels: @@ -84,6 +95,17 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8 or S16. + * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overriden: void run() override; @@ -127,6 +149,20 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
*/ void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8 or S16. + * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer. + * @param[in] rows Rows of convolution kernel. + * @param[in] cols Columns of convolution kernel. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, + uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLCONVOLUTION_H */ diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index b52695463a..fff9173210 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -94,6 +94,29 @@ public: */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. 
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h index 9252ac3c57..31b73c33c3 100644 --- a/arm_compute/runtime/CL/functions/CLCopy.h +++ b/arm_compute/runtime/CL/functions/CLCopy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,14 @@ public: * */ void configure(ICLTensor *input, ICLTensor *output); + /** Initialise the function's source and destination. + * + * @param[in] compile_context The compile context to be used. 
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLCopy * * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h index 244e345b03..86df0d46d1 100644 --- a/arm_compute/runtime/CL/functions/CLCropResize.h +++ b/arm_compute/runtime/CL/functions/CLCropResize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -62,10 +62,10 @@ public: * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used. * @note Start and end indices of boxes are inclusive. * - * @param[in] input Source tensor containing N batches of 3D images to be cropped. Data type supported: F32 - * @param[in] boxes Tensor containing the boxes used to crop the images. Data type supported: F32 + * @param[in] input Source tensor containing N batches of 3D images to be cropped. Data type supported: U16/S16/U32/S32/F16/F32 + * @param[in] boxes Tensor containing the boxes used to crop the images. It has to be known before configuration. Data type supported: F32 * @param[in] box_ind One dimensional tensor containing the batch index of the 3D image in @p input that the corresponding - * box in @p boxes will be applied to. It has to be known before configuration. Data type supported: F32 * @param[out] output Destination tensor containing a cropped and resized image for each box in @p boxes.
Data type supported: F32 * @param[in] crop_size The dimensions that each cropped image will be resized to. * @param[in] method The policy to be used when resizing image. Default is bilinear. @@ -73,6 +73,24 @@ public: */ void configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used. + * @note Start and end indices of boxes are inclusive. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor containing N batches of 3D images to be cropped. Data type supported: U16/S16/U32/S32/F16/F32 + * @param[in] boxes Tensor containing the boxes used to crop the images. It has to be known before configuration. Data type supported: F32 + * @param[in] box_ind One dimensional tensor containing the batch index of the 3D image in @p input that the corresponding + * box in @p boxes will be applied to. It has to be known before configuration. Data type supported: F32 + * @param[out] output Destination tensor containing a cropped and resized image for each box in @p boxes. Data type supported: F32 + * @param[in] crop_size The dimensions that each cropped image will be resized to. + * @param[in] method The policy to be used when resizing image. Default is bilinear. + * @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, + InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -109,6 +127,8 @@ public: std::vector> _copy; std::vector> _crop_results; std::vector> _scaled_results; + + std::vector> _internal_kernels; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CL_CROP_RESIZE_H */ diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h index 78c149d933..c75b586132 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h @@ -55,6 +55,19 @@ public: * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. 
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h index 5a1009c79f..2d3dde1ea0 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h @@ -64,6 +64,14 @@ public: * @param[in] info Contains padding and policies to be used in the deconvolution. */ void configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. Data type supported: All. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data type supported: All. 
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h index 1b9476c3a5..910b9eac51 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,27 @@ public: * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. */ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift); + /** Initialize the function's source, destination + * + * Input data type must be different than output data type. + * + * Valid conversions Input -> Output : + * + * - U8 -> S8, U16, S16, U32, S32, F16, F32 + * - U16 -> U8, S8, S16, U32, S32, F16, F32 + * - S16 -> U8, S8, U16, U32, S32, F16, F32 + * - U32 -> U8, S8, U16, S16, S32, F16, F32 + * - S32 -> U8, S8, U16, S16, U32, F16, F32 + * - F16 -> U8, S8, U16, S16, U32, F32 + * - F32 -> U8, S8, U16, S16, U32, F16 + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy. + * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayer * * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. 
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h index 0c33ed34be..dbf5898319 100644 --- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h @@ -42,6 +42,14 @@ public: * @param[in] block_shape Block shape value. */ void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayer. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index 4668e82bab..63c359e68c 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -70,6 +70,22 @@ public: */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. 
Data layout supported: NHWC, NCHW + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer * @@ -150,6 +166,22 @@ private: */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + /** Initialize the function's source, destination, conv and border_size. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. 
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3 * @@ -234,6 +266,22 @@ private: */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. 
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerGeneric * @@ -328,6 +376,23 @@ public: ARM_COMPUTE_DEPRECATED_REL_REPLACE(20.02, CLDepthwiseConvolutionLayer) void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + /** Initialize the function's source, destination, conv and border_size. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. 
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + */ + ARM_COMPUTE_DEPRECATED_REL_REPLACE(20.02, CLDepthwiseConvolutionLayer) + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3 * diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h index 48d6ba8435..c0a0fcd988 100644 --- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h @@ -44,6 +44,14 @@ public: * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor with at least 3 dimensions. 
The dimensions over the third will be interpreted as batches. + * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer * * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h index 1155d401ee..5875ceb86d 100644 --- a/arm_compute/runtime/CL/functions/CLDerivative.h +++ b/arm_compute/runtime/CL/functions/CLDerivative.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,19 @@ public: * */ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @note At least one of output_x or output_y must be not NULL. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_x (optional) Destination tensor. Derivative along the X direction. Data types supported: S16. + * @param[out] output_y (optional) Destination tensor. Derivative along the Y direction. Data types supported: S16. + * @param[in] border_mode Border mode to use + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /* ARM_COMPUTE_CLDERIVATIVE_H */ diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h index ceea4567b2..cc84820f9f 100644 --- a/arm_compute/runtime/CL/functions/CLDilate.h +++ b/arm_compute/runtime/CL/functions/CLDilate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the kernel's inputs, output and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLDILATE_H */ diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h index 045b1c0c99..0c81ffa460 100644 --- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h @@ -57,6 +57,22 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
 */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h index 6632bfce80..1fed460e69 100644 --- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h @@ -98,6 +98,21 @@ public: * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. 
+ * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLDirectDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h index e2503f7bdf..19729b61cc 100644 --- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h +++ b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLRsqrtLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. @@ -60,6 +67,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[out] output Output tensor. 
Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLExpLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. @@ -80,6 +94,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLNegLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. @@ -100,6 +121,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSinLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. @@ -120,6 +148,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. 
Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLLogLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. @@ -140,6 +175,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLAbsLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. @@ -160,6 +202,13 @@ public: * @param[out] output Output tensor. Data types supported: same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLRoundLayer * * @param[in] input First tensor input info. Data types supported: F16/F32. 
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h index 6d9f3a0e97..8c656ed8bc 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h @@ -50,6 +50,18 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs are QSYMM16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition * * @param[in] input1 First tensor input info. 
Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. @@ -82,6 +94,18 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32. @@ -113,6 +137,17 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision * * @param[in] input1 First tensor input info. Data types supported: F16/F32. @@ -143,6 +178,17 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. 
Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for max * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. @@ -173,6 +219,17 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. 
Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for min * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. @@ -203,6 +260,17 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for squared difference * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32. @@ -233,6 +301,17 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for power * * @param[in] input1 First tensor input info. Data types supported: F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h index 79c18fae9f..d907cfb092 100644 --- a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h +++ b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,13 @@ public: * @param[out] output Output of same data type with equalized brightness and contrast. */ void configure(const ICLImage *input, ICLImage *output); + /** Initialise the kernel's inputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input image. Data types supported: U8. + * @param[out] output Output of same data type with equalized brightness and contrast. + */ + void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h index a438f4e114..57f701cce2 100644 --- a/arm_compute/runtime/CL/functions/CLErode.h +++ b/arm_compute/runtime/CL/functions/CLErode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the kernel's inputs, output and border mode + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Output tensor. Data types supported: U8. 
+ * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLERODE_H */ diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h index 31e57e13c6..da153225c8 100644 --- a/arm_compute/runtime/CL/functions/CLFFT1D.h +++ b/arm_compute/runtime/CL/functions/CLFFT1D.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,14 @@ public: * @param[in] config FFT related configuration */ void configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config); + /** Initialise the function's source, destination and FFT configuration. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: F32. + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. + * @param[in] config FFT related configuration + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFT1D. * * @param[in] input Source tensor info. Data types supported: F32. diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h index d34528b9cf..a113f20f91 100644 --- a/arm_compute/runtime/CL/functions/CLFFT2D.h +++ b/arm_compute/runtime/CL/functions/CLFFT2D.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited.
* * SPDX-License-Identifier: MIT * @@ -53,6 +53,14 @@ public: * @param[in] config FFT related configuration */ void configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config); + /** Initialise the function's source, destination and FFT configuration. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: F32. + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. + * @param[in] config FFT related configuration + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFT2D. * * @param[in] input Source tensor info. Data types supported: F32. diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h index 34bb93ab54..740731950e 100644 --- a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -85,6 +85,23 @@ public: */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Set the input and output tensors. + * + * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor.
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTConvolutionLayer * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h index 2a0e0104b8..1dc87d6a38 100644 --- a/arm_compute/runtime/CL/functions/CLFastCorners.h +++ b/arm_compute/runtime/CL/functions/CLFastCorners.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -71,6 +71,19 @@ public: */ void configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source image. Data types supported: U8.
+ * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3. + * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array. + * @param[out] corners Array of keypoints to store the results. + * @param[in,out] num_corners Record number of corners in the array + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners, + BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h index c4ba257753..bb1216054f 100644 --- a/arm_compute/runtime/CL/functions/CLFill.h +++ b/arm_compute/runtime/CL/functions/CLFill.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,13 @@ public: * @param[in] constant_value Constant value to use to fill tensor. */ void configure(ICLTensor *tensor, PixelValue constant_value); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] constant_value Constant value to use to fill tensor. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value); }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLFILL_H */ diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h index ded79e5cb6..250806b1d7 100644 --- a/arm_compute/runtime/CL/functions/CLFillBorder.h +++ b/arm_compute/runtime/CL/functions/CLFillBorder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Source tensor. Data types supported: U8/S16 + * @param[in] border_width The border width + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); }; } #endif /*ARM_COMPUTE_FILLBORDER_H */ diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h index b9ce236309..98cf49af48 100644 --- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h +++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h @@ -47,6 +47,15 @@ public: * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialise the kernel's input and output. 
+ * + * @param[in] compile_context The compile context to be used. + * @param[in] input First input tensor to flatten with at least 3 dimensions. + * The dimensions above the third will be interpreted as batches. Data types supported: All. + * @param[out] output Output tensor with shape [w*h*d, input_batches] where: + * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayer * * @param[in] input First input tensor to flatten with at least 3 dimensions. diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h index c4a893fdeb..2844a5642b 100644 --- a/arm_compute/runtime/CL/functions/CLFloor.h +++ b/arm_compute/runtime/CL/functions/CLFloor.h @@ -48,7 +48,7 @@ public: * @param[in] input Source tensor. Data type supported: F16/F32. * @param[out] output Destination tensor. Same as @p input */ - void configure(CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLFloor * * @param[in] input Source tensor info. Data type supported: F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index cbd28603fc..188117f674 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -52,6 +52,13 @@ public: * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input. 
*/ void configure(const ICLTensor *input, ICLTensor *output); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayerReshapeWeights * * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -100,7 +107,16 @@ public: */ void configure(const ICLTensor *input) { - _func.configure(input, &_output); + configure(CLKernelLibrary::get().get_compile_context(), input); + } + /** Configures the @ref CLFullyConnectedLayerReshapeWeights function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input) + { + _func.configure(compile_context, input, &_output); } private: @@ -147,6 +163,23 @@ public: */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. 
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -171,9 +204,9 @@ public: void prepare() override; private: - void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); MemoryGroup _memory_group; IWeightsManager *_weights_manager; diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h index 650d2e528b..9057440fc6 100644 --- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h +++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -67,6 +67,25 @@ public: void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC + * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights + * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights + * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights + * @param[out] fused_bias Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights + * @param[in] input_bias (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights + * @param[in] bn_beta (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights + * @note if nullptr, bn_beta is set to 0.0 + * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights + * @note if nullptr, bn_gamma is set to 1.0 + * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. + * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalization * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h index 7a4f12043e..f5588112ae 100644 --- a/arm_compute/runtime/CL/functions/CLGEMM.h +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -79,7 +79,18 @@ public: */ void configure(const ICLTensor *input, GEMMRHSMatrixInfo info) { - _kernel.configure(input, &_output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, info); + } + + /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] info RHS matrix information to be used for reshaping. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info) + { + _kernel.configure(compile_context, input, &_output, info); } private: @@ -134,6 +145,26 @@ public: * in case matrix A and matrix B have been already transformed. */ void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + /** Initialise the kernel's inputs and output + * + * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. 
+ * + * @note All tensors must have the same data type. + * + * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix + * + * @param[in] compile_context The compile context to be used. + * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32 + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a. + * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. + * @param[out] output Output tensor. Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping + * in case matrix A and matrix B have been already transformed. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMM. * * @param[in] a First input tensor info (Matrix or Vector A). 
Data types supported: F16/F32 @@ -156,10 +187,11 @@ public: private: static CLGEMMKernelType select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run); - void configure_native_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, + const GEMMInfo &gemm_info); static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float 
beta, const GEMMInfo &gemm_info); diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 4952029c9d..6d1181eefe 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -62,6 +62,16 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout */ void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[out] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights * * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. @@ -93,10 +103,21 @@ public: * @param[in] num_groups Number of groups when performing a grouped convolution. 
*/ void configure(const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups) + { + configure(CLKernelLibrary::get().get_compile_context(), input, biases, num_groups); + } + /** Configures the @ref CLConvolutionLayerReshapeWeights function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] biases Biases tensor. Data type supported: Same as @p input. + * @param[in] num_groups Number of groups when performing a grouped convolution. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups) { _bias_bit = (biases != nullptr) ? 1 : 0; _num_groups = num_groups; - _func.configure(input, biases, &_output, num_groups); + _func.configure(compile_context, input, biases, &_output, num_groups); } //Inherited method override @@ -178,6 +199,28 @@ public: */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. 
+ * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer. * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -208,6 +251,7 @@ public: private: /** Configures the appropriate matrix multiply routine * + * @param[in] compile_context The compile context to be used. * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. 
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. @@ -218,8 +262,9 @@ private: * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] act_info Activation to apply after the matrix multiplication */ - void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, - const ActivationLayerInfo &act_info); + void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] input Input tensor info. Data types supported: QASYMM8/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h index 01687b69ec..d8710a461f 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h @@ -99,6 +99,17 @@ public: * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. 
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. Data layout supported: NHWC + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input. Data layout supported: same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as @p input. Data layout supported: same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 1d7013d328..6ac3cefb76 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -72,6 +72,25 @@ public: * if the reshape of matrix B should be executed only for the first run */ void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + /** Initialise the kernel's inputs, output + * + * @note GEMMLowp: low precision GEMM kernel. 
[A * B + C] + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE + * + * @param[in] compile_context The compile context to be used. + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 + * @param[out] output Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore * * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. 
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h index 4c11e51950..06cb759b16 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -75,6 +75,23 @@ public: ARM_COMPUTE_DEPRECATED_REL(20.05) void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); + /** Initialise the kernel's inputs, output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
+ */ + ARM_COMPUTE_DEPRECATED_REL(20.05) + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, + int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 @@ -137,6 +154,23 @@ public: */ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); + /** Initialise the kernel's inputs, output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, + int result_offset_after_shift, + int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 @@ -198,6 +232,23 @@ public: */ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); + /** Initialise the kernel's inputs, output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor.
Data type supported: QASYMM8_SIGNED + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, + int result_offset_after_shift, + int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 @@ -240,6 +291,23 @@ public: ARM_COMPUTE_DEPRECATED_REL(20.05) void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); + /** Initialise the kernel's inputs, output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] multiplier Float multiplier to be multiplied to each element of the input matrix + * @param[in] offset Offset to be applied to result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. + */ + ARM_COMPUTE_DEPRECATED_REL(20.05) + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, + int min = std::numeric_limits<int32_t>::lowest(), + int max = std::numeric_limits<int32_t>::max()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 @@ -300,6 +368,21 @@ public: */ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); + /** Initialise the kernel's inputs, output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor.
Data type supported: QSYMM16 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, + int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint * * @param[in] input Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 @@ -336,6 +419,16 @@ public: * @param[in] info GEMMLowp output stage metadata. */ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); + /** Initialise the kernel's inputs, output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] info GEMMLowp output stage metadata.
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h index 71843aa42a..dcd9efc6e0 100644 --- a/arm_compute/runtime/CL/functions/CLGather.h +++ b/arm_compute/runtime/CL/functions/CLGather.h @@ -43,6 +43,15 @@ public: * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 */ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All. + * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. 
Defaults to 0 + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel * diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h index 2caf6c9d74..f1906cde92 100644 --- a/arm_compute/runtime/CL/functions/CLGaussian3x3.h +++ b/arm_compute/runtime/CL/functions/CLGaussian3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLGAUSSIAN3X3_H */ diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h index 5d121a4488..d4ed772342 100644 --- a/arm_compute/runtime/CL/functions/CLGaussian5x5.h +++ b/arm_compute/runtime/CL/functions/CLGaussian5x5.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -62,6 +62,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h index aa90a5d4e3..a75a4d1028 100644 --- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h +++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -65,6 +65,16 @@ public: * */ virtual void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0; + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: U8. + * @param[in] border_mode Border mode to use. 
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * + */ + virtual void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0; protected: ICLTensor *_input; @@ -86,6 +96,7 @@ public: // Inherited methods overridden: void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; + void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; void run() override; private: @@ -109,6 +120,7 @@ public: // Inherited methods overridden: void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; + void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; void run() override; private: diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h index fb6967f7e5..91b30fabcb 100644 --- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h +++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h @@ -85,6 +85,24 @@ public: */ void configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] scores Scores from convolution layer of size (W, H, A), where H and W are the height and width of the feature map, and A is the number of anchors. + * Data types supported: QASYMM8/F16/F32 + * @param[in] deltas Bounding box deltas from convolution layer of size (W, H, 4*A). 
Data types supported: Same as @p scores + * @param[in] anchors Anchors tensor of size (4, A). Data types supported: QSYMM16 with scale of 0.125 if @p scores is QASYMM8, otherwise same as @p scores + * @param[out] proposals Box proposals output tensor of size (5, W*H*A). + * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p scores is QASYMM8, otherwise same as @p scores + * @param[out] scores_out Box scores output tensor of size (W*H*A). Data types supported: Same as @p scores + * @param[out] num_valid_proposals Scalar output tensor which says which of the first proposals are valid. Data types supported: U32 + * @param[in] info Contains GenerateProposals operation information described in @ref GenerateProposalsInfo + * + * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct. + * @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, + ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLGenerateProposalsLayer * diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h index 3214e8c3f1..71280c898a 100644 --- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h +++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,17 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
*/ void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destination, HOG data-object and border mode + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Input tensor. Data type supported: U8 + * (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block + * @param[in] hog HOG data object which describes the HOG descriptor + * @param[in] border_mode Border mode to use. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited method overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h index 6703de9f35..c2bdc15b35 100644 --- a/arm_compute/runtime/CL/functions/CLHOGDetector.h +++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -64,6 +64,22 @@ public: * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to */ void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0); + /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class + * + * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32 + * @param[in] hog HOG data-object that describes the HOG descriptor + * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects + * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions. + * It must be multiple of the block stride stored in hog + * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane + * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, + float threshold = 0.0f, + size_t idx_class = 0); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h index ec4a187864..450a4a6045 100644 --- a/arm_compute/runtime/CL/functions/CLHOGGradient.h +++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. 
+ * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -61,6 +61,19 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations, phase type and border mode + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Input tensor. Data type supported: U8. + * (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_magnitude Output tensor (magnitude). Data type supported: U16. + * @param[out] output_phase Output tensor.(phase). Format supported: U8 + * @param[in] phase_type Type of @ref PhaseType + * @param[in] border_mode Border mode to use + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, + uint8_t constant_border_value = 0); // Inherited method overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h index 424c69dad8..3d22ff69ee 100644 --- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h +++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -82,8 +82,29 @@ public: * */ void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value = 0, - float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f); + uint8_t constant_border_value = 0, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f); + /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Input tensor. Data type supported: U8 + * (Written to only for @p border_mode != UNDEFINED) + * @param[in] multi_hog Container of multiple HOG data object. Each HOG data object describes one HOG model to detect. + * This container should store the HOG data-objects in descending or ascending cell_size width order. + * This will help to understand if the HOG descriptor computation can be skipped for some HOG data-objects + * @param[out] detection_windows Array of @ref DetectionWindow used for locating the detected objects + * @param[in] detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object + * The dimension of this array must be the same of multi_hog->num_models() + * The i-th detection_window_stride of this array must be multiple of the block_stride stored in the i-th multi_hog array + * @param[in] border_mode Border mode to use. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane + * @param[in] non_maxima_suppression (Optional) Flag to specify whether the non-maxima suppression is required or not. + * True if the non-maxima suppression stage has to be computed + * @param[in] min_distance (Optional) Radial Euclidean distance to use for the non-maxima suppression stage + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, + BorderMode border_mode, uint8_t constant_border_value = 0, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f); // Inherited method overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h index 6c89d6dea6..2d0e78b00e 100644 --- a/arm_compute/runtime/CL/functions/CLHarrisCorners.h +++ b/arm_compute/runtime/CL/functions/CLHarrisCorners.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -82,6 +82,23 @@ public: void configure(ICLImage *input, float threshold, float min_dist, float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[in] threshold Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). + * @param[in] min_dist Radial Euclidean distance for the euclidean distance stage. 
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation + * @param[in] gradient_size The gradient window size to use on the input. The implementation supports 3, 5, and 7 + * @param[in] block_size The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7. + * @param[out] corners Array of keypoints to store the results. + * @param[in] border_mode Border mode to use + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used. + */ + void configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, float sensitivity, + int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, + BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h index ad389248f7..6d34dd7060 100644 --- a/arm_compute/runtime/CL/functions/CLHistogram.h +++ b/arm_compute/runtime/CL/functions/CLHistogram.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,6 +55,13 @@ public: * @param[out] output Output distribution. */ void configure(const ICLImage *input, ICLDistribution1D *output); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source image. Data types supported: U8 + * @param[out] output Output distribution. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h index ddd4b12eca..4614b90c70 100644 --- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h @@ -51,6 +51,18 @@ public: * @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution */ void configure(ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. 
* diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h index 2a452a97a3..1ea189bf33 100644 --- a/arm_compute/runtime/CL/functions/CLIntegralImage.h +++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -48,6 +48,13 @@ public: * @param[out] output Destination tensor, Data types supported: U32. */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor, Data types supported: U32. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h index e200dc758e..91c547b2cc 100644 --- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h +++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,6 +59,15 @@ public: * @param[in] epsilon (Optional) Lower bound value for the normalization. */ void configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC. + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. + * @param[in] axis Axis along which to reduce. Negative values wrap around. 
Maximum supported actual reduction axis : 2 + * @param[in] epsilon (Optional) Lower bound value for the normalization. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f); /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer. * diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index a94f239472..a29513aaae 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -102,6 +102,52 @@ public: const ICLTensor *output_state_in, const ICLTensor *cell_state_in, ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + /** Initialize function's tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32. + * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. + * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. + * @param[in] input_to_output_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. + * @param[in] recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: Same as @p input. + * @param[in] recurrent_to_cell_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: Same as @p input. + * @param[in] recurrent_to_output_weights 2D weights tensor with dimensions [output_size, num_units]. 
Data type supported: Same as @p input. + * @param[in] forget_gate_bias 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * @param[in] cell_bias 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * @param[in] output_gate_bias 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * @param[in] output_state_in 2D weights tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[in] cell_state_in 2D tensor with dimensions [num_units, batch_size]. Data type supported: Same as @p input. + * @param[out] scratch_buffer 2D tensor with dimensions [num_units * 4, batch_size] with CIFG or [num_units * 3, batch_size] without CIFG. Data type supported: Same as @p input. + * @param[out] output_state_out 2D weights tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out 2D tensor with dimensions [num_units, batch_size]. Data type supported: Same as @p input. + * @param[out] output Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. + * Data types supported: Same as @p input. + * @param[in] lstm_params Weights tensors used in peephole optimization: + * input_to_input_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. + * recurrent_to_input_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: Same as @p input. + * cell_to_input_weights 1D weights tensor with dimensions [num_units]. Can be nullptr. Data type supported: Same as @p input. + * cell_to_forget_weights 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * cell_to_output_weights 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * input_gate_bias 1D weights tensor with dimensions [num_units]. 
Data type supported: Same as @p input + * projection_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: Same as @p input. + * projection_bias 1D weights tensor with dimensions [output_size]. Data type supported: Same as @p input. + * input_layer_norm_weights 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * forget_layer_norm_weights 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * cell_layer_norm_weights 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * output_layer_norm_weights 1D weights tensor with dimensions [num_units]. Data type supported: Same as @p input. + * @param[in] activation_info Contains activation information described in @ref ActivationLayerInfo. + * @param[in] cell_threshold (Optional) The clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip]. + * If set to 0.0f then clipping is disabled. + * @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. + * If set to 0.0f then clipping is disabled. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, const ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, + const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); /** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayer * diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h index 1d39060088..082fdb4499 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -97,6 +97,33 @@ public: const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, ICLTensor *cell_state_in, const ICLTensor *output_state_in, ICLTensor *cell_state_out, ICLTensor *output_state_out); + /** Initialize function's tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. + * @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. 
+ * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_output_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_input_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_cell_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_output_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] forget_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] output_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[in] output_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out); /** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized * diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h index a407e981da..49a87baaf2 100644 --- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h +++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -68,6 +68,19 @@ public: * */ void configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: S16. + * @param[out] output The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16. 
+ * The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is: + * output.width = input.width() / pow(2,pyramid_levels-1) and output.height = input.height() / pow(2,pyramid_levels-1) + * @param[in] border_mode Border mode to use. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h index 3407f46887..2c7afde7de 100644 --- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h +++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -77,6 +77,22 @@ public: * */ void configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); + /** Initialise the function's source, destinations and border mode. + * + * The Output image must have the same size as the first level of the pyramid. + * The Input image must have the same size as the last level of the pyramid. + * + * The idea is to reconstruct the original hi-res image from a low-res representation of it and the laplacian pyramid. + * + * @param[in] compile_context The compile context to be used. + * @param[in] pyramid Laplacian pyramid tensors, Data types supported at each level: S16. + * @param[in] input Source tensor. Data types supported: S16. + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. 
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * + */ + void configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h index 1186a449d5..7a43eab478 100644 --- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -73,6 +73,19 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLLocallyConnectedLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input [width, height, IFM], diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h index 2f5932b5ab..e52ab240e4 100644 --- a/arm_compute/runtime/CL/functions/CLMagnitude.h +++ b/arm_compute/runtime/CL/functions/CLMagnitude.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,15 @@ public: * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM); + /** Initialise the kernel's inputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 First tensor input. Data types supported: S16. + * @param[in] input2 Second tensor input. Data types supported: S16. + * @param[out] output Output tensor. Data types supported: S16. + * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM); }; } #endif /*ARM_COMPUTE_CLMAGNITUDE_H */ diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h index fea1ed194f..561ac04f1d 100644 --- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h +++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. 
+ * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,14 @@ public: * @param[out] stddev (Optional) Output standard deviation of pixel values. */ void configure(ICLImage *input, float *mean, float *stddev = nullptr); + /** Initialise the kernel's inputs and outputs. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Input image. Data types supported: U8/F16/F32. (Written to only for border filling) + * @param[out] mean Output average pixel value. + * @param[out] stddev (Optional) Output standard deviation of pixel values. + */ + void configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDev * * @param[in] input Input image. Data types supported: U8/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h index 565f8f3040..e39a5908b8 100644 --- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,16 @@ public: * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. */ void configure(ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); + /** Initialise the function's input and outputs. + * + * @note If the output tensor is a nullptr, the normalization will be performed in-place. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Input tensor with 2 dimensions. Data types supported: F16/F32. + * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. 
Data type supported: same as @p input + * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr, diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h index 3a9a95a5f3..f3bb2832ef 100644 --- a/arm_compute/runtime/CL/functions/CLMedian3x3.h +++ b/arm_compute/runtime/CL/functions/CLMedian3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,15 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLMEDIAN3X3_H */ diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h index 30a29f2b8c..e9e3bd910c 100644 --- a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h +++ b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -66,6 +66,22 @@ public: void configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr, uint32_t *min_count = nullptr, uint32_t *max_count = nullptr); + /** Initialise the kernel's inputs and outputs. + * + * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input image. Data types supported: U8/S16/F32. + * @param[out] min Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32. + * @param[out] max Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32. + * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations. + * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations. + * @param[out] min_count (Optional) Number of minimum value encounters. + * @param[out] max_count (Optional) Number of maximum value encounters. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, + CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr, + uint32_t *min_count = nullptr, uint32_t *max_count = nullptr); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h index a7c87d35b0..79f73ea370 100644 --- a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h +++ b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,20 @@ public: */ void configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8 + * @param[in] function Non linear function to perform + * @param[in] mask_size Mask size. Supported sizes: 3, 5 + * @param[in] pattern Mask pattern + * @param[in] mask The given mask. Will be used only if pattern is specified to PATTERN_OTHER + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLNONLINEARFILTER_H */ diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h index 0859a09bdb..e2c0c4f814 100644 --- a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h +++ b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,18 @@ public: * The implementation supports just 2 border modes: UNDEFINED and CONSTANT */ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode); + /** Initialise the function's source, destinations and border mode. + * + * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT + * The constant values used with CONSTANT border mode is 0 + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination for the Non-Maxima suppressions 3x3. Data types supported: same as @p input. + * @param[in] border_mode Border mode to use for non-maxima suppression. 
+ * The implementation supports just 2 border modes: UNDEFINED and CONSTANT + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode); }; } #endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H */ diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h index d06bf56794..07bb62c7d7 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -58,6 +58,17 @@ public: * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler). + * Data layouts supported: NCHW/NHWC. + * @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones. + * Data types supported: same as @p input. Data layouts supported: same as @p input. + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer * * @param[in] input Source tensor. 
3 lower dims represent a single input with dimensions [width, height, IFM], diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h index 5fbfdd18b7..5dd3760d3d 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h @@ -50,6 +50,17 @@ public: * Data types supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels]. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] output Destination feature tensor. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: Same as @p input + * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. + * Data types supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayer * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h index 33df175287..12d0583384 100644 --- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h +++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited.
* * SPDX-License-Identifier: MIT * @@ -91,6 +91,27 @@ public: const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function input and output + * + * @param[in] compile_context The compile context to be used. + * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data types supported U8 + * @param[in] new_pyramid Pointer to the pyramid for the new tensor. Data types supported U8 + * @param[in] old_points Pointer to the IKeyPointArray storing old key points + * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimates key points + * @param[out] new_points Pointer to the IKeyPointArray storing new key points + * @param[in] termination The criteria to terminate the search of each keypoint. + * @param[in] epsilon The error for terminating the algorithm + * @param[in] num_iterations The maximum number of iterations before terminating the algorithm + * @param[in] window_dimension The size of the window on which to perform the algorithm + * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used + * @param[in] border_mode The border mode applied at scharr kernel stage + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT + * + */ + void configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, + const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, + BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overridden: void
run() override; diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h index 7f8a41238c..74fa86a320 100644 --- a/arm_compute/runtime/CL/functions/CLPReluLayer.h +++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h @@ -47,6 +47,16 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input */ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLPReluLayer * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h index f020d68c92..82d7205381 100644 --- a/arm_compute/runtime/CL/functions/CLPadLayer.h +++ b/arm_compute/runtime/CL/functions/CLPadLayer.h @@ -63,6 +63,19 @@ public: * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor.
Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i] + * specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, + * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayer. * diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h index b1705cf4c5..37e651cfbb 100644 --- a/arm_compute/runtime/CL/functions/CLPermute.h +++ b/arm_compute/runtime/CL/functions/CLPermute.h @@ -46,6 +46,16 @@ public: * @param[in] perm Permutation vector */ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); + /** Set the input and output tensors. + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The input tensor to permute. Data types supported: All. + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration of @ref CLPermute.
* * @note Arbitrary permutation vectors are supported with rank not greater than 4 diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h index ac8a8670fc..f993906fe2 100644 --- a/arm_compute/runtime/CL/functions/CLPhase.h +++ b/arm_compute/runtime/CL/functions/CLPhase.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,15 @@ public: * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED); + /** Initialise the kernel's inputs, output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 First tensor input. Data types supported: S16. + * @param[in] input2 Second tensor input. Data types supported: S16. + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED); }; } #endif /*ARM_COMPUTE_CLPHASE_H */ diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h index 47bb2bf4db..8b0ee70f12 100644 --- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -59,6 +59,22 @@ public: */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. @@ -98,6 +114,17 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Initialise the kernel's inputs, output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2. 
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2. diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h index 05b35dcee8..7d646ab268 100644 --- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h @@ -49,6 +49,15 @@ public: * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. 
+ * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h index eea1399552..d39e4112f9 100644 --- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h +++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -46,6 +46,15 @@ public: * @param[in] info Prior box layer info. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC. + * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1 + * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1 + * @param[in] info Prior box layer info. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayer * * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC. 
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index ab34135ff5..72a61f8505 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -113,6 +113,55 @@ public: ICLTensor *cell_state_out, ICLTensor *output_state_out, const LSTMParams &lstm_params); + /** Initialize function's tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. + * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. + * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. + * @param[in] input_to_output_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. + * @param[in] recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8. + * @param[in] recurrent_to_cell_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8. + * @param[in] recurrent_to_output_weights 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8. + * @param[in] forget_gate_bias 1D weights tensor with dimensions [num_units]. Data type supported: S32. + * @param[in] cell_bias 1D weights tensor with dimensions [num_units]. Data type supported: S32. + * @param[in] output_gate_bias 1D weights tensor with dimensions [num_units]. Data type supported: S32. + * @param[in] cell_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[in] output_state_in 2D tensor with dimensions [num_units, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out Destination tensor. 
Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [num_units, batch_size]. Data types supported: Same as @p input. + * @param[in] lstm_params Weights tensors used in peephole, CIFG and layer normalization optimizations: + * input_intermediate_scale Scale of the intermediate result of matmul, i.e. input to layer normalization, at input gate. + * forget_intermediate_scale Scale of the intermediate result of matmul, i.e. input to layer normalization, at forget gate. + * cell_intermediate_scale Scale of the intermediate result of matmul, i.e. input to layer normalization, at cell gate. + * output_intermediate_scale Scale of the intermediate result of matmul, i.e. input to layer normalization, at output gate. + * hidden_state_zero The zero point of the hidden state. + * hidden_state_scale The scale of the hidden state. + * input_to_input_weights (Optional) 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. + * recurrent_to_input_weights (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8. + * cell_to_input_weights (Optional) 1D weights tensor with dimensions [num_units]. Can be nullptr. Data type supported: QSYMM16. + * cell_to_forget_weights (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16. + * cell_to_output_weights (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16. + * input_gate_bias (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: S32. + * projection_weights (Optional) 2D weights tensor with dimensions [output_size, num_units]. Data type supported: QSYMM8. + * projection_bias (Optional) 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * input_layer_norm_weights (Optional) 1D weights tensor with dimensions [num_units].
Data type supported: QSYMM16. + * forget_layer_norm_weights (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16. + * cell_layer_norm_weights (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16. + * output_layer_norm_weights (Optional) 1D weights tensor with dimensions [num_units]. Data type supported: QSYMM16. + * cell_threshold (Optional) The clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip]. + * If set to 0.0 then clipping is disabled. + * projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within + * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + const ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out, + const LSTMParams &lstm_params); + /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer * * @param[in] input Source tensor info. Input is a 2D tensor info with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. @@ -169,19 +218,20 @@ public: private: /** Internal method to configure matrix multiplication plus output stage of each gate. * - * @param[in] mm Matrix multiplication function to use. - * @param[in] outstage Output stage function to use. - * @param[in] gemmlowp_info GEMMLowp metadata to be used by the output stage. 
- * @param[in] mm_input Input tensor to matrix multiplication function. - * @param[in] mm_weights Weights tensor to matrix multiplication function. - * @param[in] bias Bias tensor to matrix multiplication function. - * @param[in] outstage_res Tensor to be used for storing the result of the output stage. - * @param[in] gemmlowp_scale Real multiplier to be used computing multiplier and shift for requantization. - * @param[in] mm_res_info Tensor info to be used to initialize matrix multiplication result tensor. - * @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor. + * @param[in] compile_context The compile context to be used. + * @param[in] mm Matrix multiplication function to use. + * @param[in] outstage Output stage function to use. + * @param[in] gemmlowp_info GEMMLowp metadata to be used by the output stage. + * @param[in] mm_input Input tensor to matrix multiplication function. + * @param[in] mm_weights Weights tensor to matrix multiplication function. + * @param[in] bias Bias tensor to matrix multiplication function. + * @param[in] outstage_res Tensor to be used for storing the result of the output stage. + * @param[in] gemmlowp_scale Real multiplier to be used computing multiplier and shift for requantization. + * @param[in] mm_res_info Tensor info to be used to initialize matrix multiplication result tensor. + * @param[in] outstage_tensor_info Tensor info to be used to initialize output stage result tensor.
* */ - void configure_mm(CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, + void configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info); diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h index fbdef53aeb..f59e3b7919 100644 --- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h @@ -48,6 +48,15 @@ public: * @note Output auto initialization is not supported by this function */ void configure(const ICLTensor *input, ICLTensor *output); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] output Destination tensor with the same dimensions as input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. + * + * @note Output auto initialization is not supported by this function + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayer * * @param[in] input Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32.
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h index 569e3da89e..0291eb17a9 100644 --- a/arm_compute/runtime/CL/functions/CLRNNLayer.h +++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -52,6 +52,19 @@ public: * @param[in] info Activation layer parameter. */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info); /** Initialize the function * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data types supported: F16/F32 diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h index 7c2c6eb26f..b6defe6c7f 100644 --- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h @@ -56,6 +56,22 @@ public: * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner + * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. + * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. + * + * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled + * width and pooled height. + * @note The z dimensions of @p output tensor and @p input tensor must be the same. + * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayer * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h index 7d0e1da4f8..0376e7847c 100644 --- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -56,6 +56,21 @@ public: * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: F16/F32. + * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner + * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16 + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. + * + * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled + * width and pooled height. + * @note The z dimensions of @p output tensor and @p input tensor must be the same. + * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); }; } #endif /* ARM_COMPUTE_CLROIPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h index 2cc8376b72..19e11bacd4 100644 --- a/arm_compute/runtime/CL/functions/CLRange.h +++ b/arm_compute/runtime/CL/functions/CLRange.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,15 @@ public: * @param[in] step The gap between each pair of values in the sequence. Default is 1. */ void configure(ICLTensor *output, float start, float end, float step = 1.f); + /** Initialize the kernel's start, end, step and output tensor. + * + * @param[in] compile_context The compile context to be used. + * @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] start The starting value of the sequence. + * @param[in] end The ending (not including) value of the sequence. + * @param[in] step The gap between each pair of values in the sequence. Default is 1. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f); /** Static function to check if given info will lead to a valid configuration of @ref CLRange * * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h index 30000edd62..57ec48d690 100644 --- a/arm_compute/runtime/CL/functions/CLReduceMean.h +++ b/arm_compute/runtime/CL/functions/CLReduceMean.h @@ -51,6 +51,17 @@ public: * @param[out] output Destination tensor. 
Data type supported: Same as @p input */ void configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMean * diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h index 254c7309fd..25cf655802 100644 --- a/arm_compute/runtime/CL/functions/CLReductionOperation.h +++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h @@ -61,6 +61,16 @@ public: * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true. */ void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3 + * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX + * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation. * diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h index f035ac902c..dc8a2c4ecf 100644 --- a/arm_compute/runtime/CL/functions/CLRemap.h +++ b/arm_compute/runtime/CL/functions/CLRemap.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,20 @@ public: */ void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's sources, destination, interpolation policy and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[in] map_x Map for X coords. Data types supported: F32. + * @param[in] map_y Map for Y coords. Data types supported: F32. + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported. + * @param[in] border_mode Border mode to use on the input tensor. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, + InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLREMAP_H */ diff --git a/arm_compute/runtime/CL/functions/CLReorgLayer.h b/arm_compute/runtime/CL/functions/CLReorgLayer.h index dd08c0f1fc..8b245ab441 100644 --- a/arm_compute/runtime/CL/functions/CLReorgLayer.h +++ b/arm_compute/runtime/CL/functions/CLReorgLayer.h @@ -45,6 +45,18 @@ public: * */ void configure(ICLTensor *input, ICLTensor *output, int32_t stride); + /** Initialise the function's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Destination tensor with tensor shape: + * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has + * the same number of input elements. Data types supported: same as @p input. + * @param[in] stride Stride value to use for reorganizing the values in the output tensor. + * It defines the spatial distance between 2 consecutive pixels in the x and y direction + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride); /** Static function to check if given info will lead to a valid configuration of @ref CLReorgLayer * * @param[in] input Source tensor. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h index 63fe5457a3..e91c2c739b 100644 --- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h +++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h @@ -40,6 +40,13 @@ public: * @param[out] output Output tensor. 
Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] input First tensor input. Data type supported: All + * @param[out] output Output tensor. Data type supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayer * diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h index f87bd19a90..87ae34c89d 100644 --- a/arm_compute/runtime/CL/functions/CLReverse.h +++ b/arm_compute/runtime/CL/functions/CLReverse.h @@ -41,6 +41,14 @@ public: * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 */ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel * * @param[in] input Input tensor info. Data types supported: All. 
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h index c06c9b629a..f345995138 100644 --- a/arm_compute/runtime/CL/functions/CLScale.h +++ b/arm_compute/runtime/CL/functions/CLScale.h @@ -51,6 +51,21 @@ public: */ void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false); + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: Same as @p input + * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] policy The interpolation type. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER + * @param[in] use_padding (Optional) Is padding in use or not. Defaults to true. + * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(), + SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false); /** Static function to check if given info will lead to a valid configuration of @ref CLScale * diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h index 708243260c..b25b548eaa 100644 --- a/arm_compute/runtime/CL/functions/CLScharr3x3.h +++ b/arm_compute/runtime/CL/functions/CLScharr3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -53,6 +53,18 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @note At least one of output_x or output_y must be not NULL. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_x (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16. + * @param[out] output_y (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLSCHARR3X3_H */ diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h index a7e06e4eec..84d0997149 100644 --- a/arm_compute/runtime/CL/functions/CLSelect.h +++ b/arm_compute/runtime/CL/functions/CLSelect.h @@ -45,6 +45,15 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p x. */ void configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] c Condition input tensor. Data types supported: U8. + * @param[in] x First input tensor. Data types supported: All. + * @param[in] y Second input tensor. Data types supported: Same as @p x + * @param[out] output Output tensor. Data types supported: Same as @p x. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSelect * * @param[in] c Condition input tensor. Data types supported: U8. diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h index f5fca43874..a8c6e1ff0b 100644 --- a/arm_compute/runtime/CL/functions/CLSlice.h +++ b/arm_compute/runtime/CL/functions/CLSlice.h @@ -48,6 +48,20 @@ public: * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). */ void configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * @note Start indices must be non-negative. 
0 <= starts[i] + * @note End coordinates can be negative, which represents the number of elements before the end of that dimension. + * @note End indices are not inclusive unless negative. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: All. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends); /** Static function to check if given info will lead to a valid configuration of @ref CLSlice * diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h index 2f4cf50465..24bc0cda43 100644 --- a/arm_compute/runtime/CL/functions/CLSobel3x3.h +++ b/arm_compute/runtime/CL/functions/CLSobel3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -53,6 +53,18 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @note At least one of output_x or output_y must be not NULL. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_x (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16. 
+ * @param[out] output_y (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLSOBEL3X3_H */ diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h index 2a9136b92e..bf266270c3 100644 --- a/arm_compute/runtime/CL/functions/CLSobel5x5.h +++ b/arm_compute/runtime/CL/functions/CLSobel5x5.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -65,6 +65,18 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @note At least one of output_x or output_y must be not NULL. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_x (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16. + * @param[out] output_y (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h index e3188b85f5..13932c704a 100644 --- a/arm_compute/runtime/CL/functions/CLSobel7x7.h +++ b/arm_compute/runtime/CL/functions/CLSobel7x7.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -65,6 +65,18 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialise the function's source, destinations and border mode. + * + * @note At least one of output_x or output_y must be not NULL. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_x (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32. + * @param[out] output_y (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h index 751b68d0cf..fadbc430e6 100644 --- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -67,6 +67,17 @@ public: * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image. */ void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32 + * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f + * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis + * dimensions together. For instance, given a [4x4x4x4] image, + * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer * * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32 @@ -97,6 +108,21 @@ private: * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image. 
*/ void configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis); + /** Utility method to configure the kernels needed to flatten the input + * tensor. + * + * @note This function changes the internal state of this class. In particular, + * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and + * @p _output_flat + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Original source tensor. + * @param[in] output Original destination tensor. + * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis + * dimensions together. For instance, given a [4x4x4x4] image, + * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image. + */ + void configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis); MemoryGroup _memory_group; CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel; diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h index ef9f164112..b8e2bdc4c6 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h @@ -63,6 +63,15 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. + * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[out] output Tensor output. 
Data types supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); /** Set the input and output tensors. (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -73,6 +82,18 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + /** Set the input and output tensors. (Static block shape and paddings) + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. + * @param[in] padding_right The right padding of the output tensor. + * @param[out] output Tensor output. Data types supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h index be7937d0e6..ac011dd998 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h @@ -46,6 +46,14 @@ public: * @param[in] block_shape Block shape value. 
*/ void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape); + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value. + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayer. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h index ebce4f34d0..9b204458c3 100644 --- a/arm_compute/runtime/CL/functions/CLStackLayer.h +++ b/arm_compute/runtime/CL/functions/CLStackLayer.h @@ -56,6 +56,17 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. */ void configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output); + /** Initialise the kernel's inputs vector and output. + * + * @note Supported input tensor rank: up to 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The vectors containing all the tensors with the same shape to stack. Data types supported: All. + * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions. + * Negative values wrap around + * @param[out] output Output tensor. Data types supported: Same as @p input. 
+ */ + void configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel * * @note Supported input tensor rank: up to 4 diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h index 6bde2c0af4..bb2bc962d6 100644 --- a/arm_compute/runtime/CL/functions/CLStridedSlice.h +++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h @@ -52,6 +52,24 @@ public: void configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: All. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead. + * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead. + * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. + * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
+ */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, + const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, + int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice * diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h index c1b7b943a0..1c11f076a3 100644 --- a/arm_compute/runtime/CL/functions/CLTableLookup.h +++ b/arm_compute/runtime/CL/functions/CLTableLookup.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,14 @@ public: * @param[out] output Output tensor. Data types supported: U8 and S16 */ void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output); + /** Initialise the kernel's inputs and output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input First tensor input. Data types supported: U8 and S16 + * @param[in] lut Input lookup table. Data types supported: U8 and S16 + * @param[out] output Output tensor. Data types supported: U8 and S16 + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output); }; } #endif /*ARM_COMPUTE_CLTABLELOOKUP_H */ diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h index a19b320b3e..d8ae6fbb34 100644 --- a/arm_compute/runtime/CL/functions/CLThreshold.h +++ b/arm_compute/runtime/CL/functions/CLThreshold.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,6 +50,20 @@ public: void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value = 0, uint8_t true_value = 0, ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0); + /** Initialise the function's source, destination, thresholds and threshold type + * + * @param[in] compile_context The compile context to be used. + * @param[in] input First tensor input. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] threshold Threshold. If upper threshold is specified, this will be used as the lower threshold. + * @param[in] false_value Value to assign when the condition is false. + * @param[in] true_value value to assign when the condition is true. + * @param[in] type Thresholding type. Can either be BINARY or RANGE. + * @param[in] upper Upper threshold. Only used with RANGE thresholding + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, + uint8_t false_value = 0, uint8_t true_value = 0, + ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0); }; } #endif /*ARM_COMPUTE_CLTHRESHOLD_H */ diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h index 9c83b0cace..0dad9ad89d 100644 --- a/arm_compute/runtime/CL/functions/CLTile.h +++ b/arm_compute/runtime/CL/functions/CLTile.h @@ -43,6 +43,14 @@ public: * @param[out] output Destination tensor. Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples); + /** Set the source, destination of the kernel + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: All. + * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension. + * @param[out] output Destination tensor. 
Same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples); /** Static function to check if given info will lead to a valid configuration of @ref CLTile * * @param[in] input Source tensor info. Data type supported: All. diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h index 61092a1914..b2fdcda5c4 100644 --- a/arm_compute/runtime/CL/functions/CLTranspose.h +++ b/arm_compute/runtime/CL/functions/CLTranspose.h @@ -44,6 +44,13 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); + /** Initialise the kernel's inputs and output + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLTranspose * * @param[in] input The input tensor. Data types supported: All. diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h index 814d07384c..777da692be 100644 --- a/arm_compute/runtime/CL/functions/CLUnstack.h +++ b/arm_compute/runtime/CL/functions/CLUnstack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,6 +55,16 @@ public: * */ void configure(const ICLTensor *input, const std::vector &output_vector, int axis); + /** Set the input, output and unstacking axis. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. 
+ * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input. + * Note: The number of elements of the vector will be used as the number of slices to be taken from the axis. + * @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around. + * + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector &output_vector, int axis); /** Static function to check if given info will lead to a valid configuration of @ref CLUnstack * * @param[in] input Input tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 diff --git a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h index 1695fd7d2b..5f4f57f824 100644 --- a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h +++ b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h @@ -60,6 +60,16 @@ public: */ void configure(ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy); + /** Initialize the function's source, destination, stride information and upsampling policy. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[in] info Contains stride information described in @ref Size2D. + * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, + const Size2D &info, const InterpolationPolicy upsampling_policy); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h index 2de7107f13..1a2fe9d4d5 100644 --- a/arm_compute/runtime/CL/functions/CLWarpAffine.h +++ b/arm_compute/runtime/CL/functions/CLWarpAffine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -48,6 +48,19 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, interpolation policy and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] matrix The affine matrix. Must be 2x3 of type float. + * The matrix argument requires 9 values, the last 3 values are ignored. + * @param[in] policy The interpolation type. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, + uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLWARPAFFINE_H */ diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h index 93fcc85a95..5db9ec4cf0 100644 --- a/arm_compute/runtime/CL/functions/CLWarpPerspective.h +++ b/arm_compute/runtime/CL/functions/CLWarpPerspective.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,6 +47,18 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ void configure(ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); + /** Initialize the function's source, destination, interpolation policy and border_mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] matrix The perspective matrix. Must be 3x3 of type float. + * @param[in] policy The interpolation type. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, + uint8_t constant_border_value = 0); }; } #endif /*ARM_COMPUTE_CLWARPPERSPECTIVE_H */ diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index 7ac59c900c..c1de5f15ce 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -75,6 +75,26 @@ public: */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + /** Set the input and output tensors. + * + * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout + * @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. 
+ * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer * * @note: This function only works with 3x3,3x1,1x3,5x5,5x1 and 1x5 kernels along with unit strides for both NCHW and NHWC data layout diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h index e1ab928cf2..11a402e51d 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h +++ b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,6 +55,25 @@ public: * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. */ void configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info); + /** Set the input and output tensors. 
+ * + * @note Winograd input transform supports the following configurations for NCHW data layout + * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), + * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * @note Winograd input transform supports the following configurations for NHWC data layout + * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * Strides: only unit strides + * + * @param[in] compile_context The compile context to be used. + * @param[in] input The input tensor to transform. Data types supported: F16,F32 + * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransform. * * @note Winograd input transform supports the following configurations for NCWH data layout diff --git a/arm_compute/runtime/CL/functions/CLYOLOLayer.h b/arm_compute/runtime/CL/functions/CLYOLOLayer.h index 95c684b2c3..e70d84b97e 100644 --- a/arm_compute/runtime/CL/functions/CLYOLOLayer.h +++ b/arm_compute/runtime/CL/functions/CLYOLOLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -53,6 +53,18 @@ public: * @param[in] num_classes Number of classes to activate (must be submultiple of @p input channels) */ void configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes); + /** Set the input and output tensor.
+ * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer parameters. + * @param[in] num_classes Number of classes to activate (must be submultiple of @p input channels) + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes); /** Static function to check if given info will lead to a valid configuration of @ref CLYOLOLayer * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp index cf77c807cf..492c54e4d3 100644 --- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp +++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output); + k->configure(compile_context, input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp index d529d9402a..a81d1d042b 100644 --- a/src/runtime/CL/functions/CLAccumulate.cpp +++ 
b/src/runtime/CL/functions/CLAccumulate.cpp @@ -31,22 +31,37 @@ using namespace arm_compute; void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, accum); +} + +void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, accum); + k->configure(compile_context, input, accum); _kernel = std::move(k); } void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum); +} + +void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, alpha, accum); + k->configure(compile_context, input, alpha, accum); _kernel = std::move(k); } void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum); +} + +void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, shift, accum); + k->configure(compile_context, input, shift, accum); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index 9882145741..989603a9df 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -36,9 +36,14 @@ CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) } void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, 
output, act_info); +} + +void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, act_info); + k->configure(compile_context, input, output, act_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index 4ac6d25d75..5b4c694f33 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -105,6 +105,11 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen } void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); +} + +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); @@ -121,7 +126,7 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou // Create temporary tensors if(_num_of_stages == 1) { - _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); + _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op); } else { @@ -135,22 +140,22 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou // Apply ReductionOperation only on first kernel _memory_group.manage(&_results_vector[0]); - _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op); + 
_reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], axis, op); + _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op); _results_vector[i - 1].allocator()->allocate(); } // Apply ReductionOperation on the last stage const unsigned int last_stage = _num_of_stages - 1; - _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op); + _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op); _results_vector[last_stage - 1].allocator()->allocate(); } - _reshape_kernel.configure(&_not_reshaped_output, output); + _reshape_kernel.configure(compile_context, &_not_reshaped_output, output); _not_reshaped_output.allocator()->allocate(); } diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index f87ea6ea06..9fc51136b3 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -40,7 +40,14 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer() void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, ActivationLayerInfo act_info) { - _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); + configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); +} + +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, + const ICLTensor *gamma, float epsilon, + ActivationLayerInfo act_info) +{ + _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index 7919b131c8..0a2ae2a6e0 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,12 +39,22 @@ CLBatchToSpaceLayer::CLBatchToSpaceLayer() void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - _batch_to_space_kernel.configure(input, block_shape, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); +} + +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +{ + _batch_to_space_kernel.configure(compile_context, input, block_shape, output); } void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) { - _batch_to_space_kernel.configure(input, block_shape_x, block_shape_y, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output); +} + +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +{ + _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output); } Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index 55ee78cd28..1fa80f0a24 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, 
input2, output); + k->configure(compile_context, input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index f20363169b..46595191a0 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index f1d07df1e7..8431140cb8 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output); + k->configure(compile_context, input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index e5d3c7a292..0e0e7f2028 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output); + k->configure(compile_context, input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 8669da636e..55bcde749c 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -29,10 +29,15 @@ namespace arm_compute { void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); +} + +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) { // Configure Bounding Box kernel auto k = arm_compute::support::cpp14::make_unique(); - k->configure(boxes, pred_boxes, deltas, info); + k->configure(compile_context, boxes, pred_boxes, deltas, info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp index d288e42d9b..72c822197c 100644 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ b/src/runtime/CL/functions/CLBox3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, 
BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp index 28abaa284b..0c8d3532aa 100644 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ b/src/runtime/CL/functions/CLCannyEdge.cpp @@ -57,6 +57,13 @@ CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLI void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value); +} + +void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, + BorderMode border_mode, + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -111,19 +118,19 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t if(gradient_size == 3) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); _sobel = std::move(k); } else if(gradient_size == 5) { auto k = arm_compute::support::cpp14::make_unique(); - 
k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); _sobel = std::move(k); } else if(gradient_size == 7) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); _sobel = std::move(k); } else @@ -136,7 +143,7 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t _memory_group.manage(&_phase); // Configure gradient - _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type); + _gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -146,14 +153,14 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t _memory_group.manage(&_nonmax); // Configure non-maxima suppression - _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); + _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); // Allocate intermediate buffers _phase.allocator()->allocate(); // Fill border around magnitude image as non-maxima suppression will access // it. If border mode is undefined filling the border is a nop. 
- _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); + _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); // Allocate intermediate buffers _mag.allocator()->allocate(); @@ -165,7 +172,7 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t _memory_group.manage(&_l1_list_counter); // Configure edge tracing - _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); + _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); // Allocate intermediate buffers _visited.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index e3813b3008..7048a79bc5 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); +} + +void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, policy, 0); + k->configure(compile_context, input, output, policy, 0); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp index c7f2748b0b..249212e03b 100644 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ b/src/runtime/CL/functions/CLChannelCombine.cpp @@ -31,15 +31,25 @@ using namespace arm_compute; void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output); +} + +void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(plane0, plane1, plane2, plane3, output); + k->configure(compile_context, plane0, plane1, plane2, plane3, output); _kernel = std::move(k); } void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output); +} + +void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(plane0, plane1, plane2, output); + k->configure(compile_context, plane0, plane1, plane2, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp index 810103994d..019e0a7a90 100644 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ b/src/runtime/CL/functions/CLChannelExtract.cpp @@ -31,15 +31,25 @@ using namespace arm_compute; void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); +} + +void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, channel, output); + k->configure(compile_context, input, channel, output); _kernel = std::move(k); } void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); +} + +void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, channel, output); + k->configure(compile_context, input, channel, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 2e1192a725..93ab7c7ddf 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -30,9 +30,14 @@ namespace arm_compute { void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); +} + +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, num_groups); + k->configure(compile_context, input, output, num_groups); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp index 49aacea788..b8e597751b 100644 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ b/src/runtime/CL/functions/CLColorConvert.cpp @@ -31,29 +31,49 @@ using namespace arm_compute; void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } void 
CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index ac56d5782c..8d5ec3571d 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); +} + +void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) { auto k = 
arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, operation); + k->configure(compile_context, input1, input2, output, operation); _kernel = std::move(k); if(output->info()->dimension(0) > 1) @@ -42,7 +47,7 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -54,9 +59,15 @@ Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *inpu template void CLComparisonStatic::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +template +void CLComparisonStatic::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, COP); + k->configure(compile_context, input1, input2, output, COP); _kernel = std::move(k); if(output->info()->dimension(0) > 1) @@ -65,7 +76,7 @@ void CLComparisonStatic::configure(ICLTensor *input1, ICLTensor *input2, IC if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp index b71e89c804..62714fed5c 100644 --- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp +++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp @@ -28,10 +28,15 @@ namespace arm_compute { void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const 
ComputeAnchorsInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); +} + +void CLComputeAllAnchors::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) { // Configure ComputeAllAnchors kernel auto k = arm_compute::support::cpp14::make_unique(); - k->configure(anchors, all_anchors, info); + k->configure(compile_context, anchors, all_anchors, info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index c85450b2f6..e97256713f 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -49,12 +49,22 @@ CLConcatenateLayer::CLConcatenateLayer() void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) { - configure_internal(std::move(inputs_vector), output, axis); + configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); +} + +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure_internal(compile_context, std::move(inputs_vector), output, axis); } void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) { - configure_internal(std::move(inputs_vector), output, axis); + configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); +} + +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure_internal(compile_context, std::move(inputs_vector), output, axis); } Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) @@ -68,7 +78,7 @@ Status CLConcatenateLayer::validate(const std::vector &inpu } template -void 
CLConcatenateLayer::configure_internal(std::vector &&inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector &&inputs_vector, ICLTensor *output, size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); _axis = axis; @@ -97,7 +107,7 @@ void CLConcatenateLayer::configure_internal(std::vector &&inputs_v { // Configure WidthConcatenate2Tensors kernel auto kernel = support::cpp14::make_unique(); - kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output); + kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output); _concat_kernels.emplace_back(std::move(kernel)); break; } @@ -105,7 +115,7 @@ void CLConcatenateLayer::configure_internal(std::vector &&inputs_v { // Configure WidthConcatenate4Tensors kernel auto kernel = support::cpp14::make_unique(); - kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); + kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); _concat_kernels.emplace_back(std::move(kernel)); break; } @@ -115,7 +125,7 @@ void CLConcatenateLayer::configure_internal(std::vector &&inputs_v for(unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = support::cpp14::make_unique(); - kernel->configure(inputs_vector.at(i), offset, output); + kernel->configure(compile_context, inputs_vector.at(i), offset, output); offset += inputs_vector.at(i)->info()->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } @@ -129,7 +139,7 @@ void CLConcatenateLayer::configure_internal(std::vector &&inputs_v for(unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = support::cpp14::make_unique(); - kernel->configure(inputs_vector.at(i), offset, output); + kernel->configure(compile_context, inputs_vector.at(i), offset, output); offset += inputs_vector.at(i)->info()->dimension(_axis); 
_concat_kernels.emplace_back(std::move(kernel)); } @@ -140,7 +150,7 @@ void CLConcatenateLayer::configure_internal(std::vector &&inputs_v for(unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = support::cpp14::make_unique(); - kernel->configure(inputs_vector.at(i), offset, output); + kernel->configure(compile_context, inputs_vector.at(i), offset, output); offset += inputs_vector.at(i)->info()->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } @@ -151,7 +161,7 @@ void CLConcatenateLayer::configure_internal(std::vector &&inputs_v for(unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = support::cpp14::make_unique(); - kernel->configure(inputs_vector.at(i), offset, output); + kernel->configure(compile_context, inputs_vector.at(i), offset, output); offset += inputs_vector.at(i)->info()->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index 02927e83ad..68c0fb6ebf 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -27,9 +27,15 @@ namespace arm_compute { void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); +} + +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, + DataLayout data_layout) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, original_input_shape, data_layout); + k->configure(compile_context, input, output, original_input_shape, data_layout); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLConvolution.cpp 
b/src/runtime/CL/functions/CLConvolution.cpp index b86a08e957..2b0d7d5e53 100644 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ b/src/runtime/CL/functions/CLConvolution.cpp @@ -39,11 +39,17 @@ using namespace arm_compute; void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); +} + +void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, + uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } template @@ -55,6 +61,13 @@ CLConvolutionSquare::CLConvolutionSquare(std::shared_ptr void CLConvolutionSquare::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); +} + +template +void CLConvolutionSquare::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON(conv == nullptr); @@ -75,17 +88,17 @@ void CLConvolutionSquare::configure(ICLTensor 
*input, ICLTensor *ou scale = calculate_matrix_scale(conv, matrix_size); } - _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); + _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); + _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffer _tmp.allocator()->allocate(); } else { - _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); } } @@ -112,9 +125,15 @@ template class arm_compute::CLConvolutionSquare<7>; template class arm_compute::CLConvolutionSquare<9>; void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value); +} + +void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, + BorderMode border_mode, uint8_t constant_border_value) { auto k = 
arm_compute::support::cpp14::make_unique(); - k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index c271f502e9..b6e1413f7a 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,13 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); +} + +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, @@ -57,7 +64,7 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c { ARM_COMPUTE_ERROR_ON(num_groups != 1); auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); + f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math); _function = std::move(f); break; } @@ -65,21 +72,21 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c { ARM_COMPUTE_ERROR_ON(num_groups != 1); auto f = arm_compute::support::cpp14::make_unique(); - f->configure(input, weights, biases, output, conv_info, act_info); + f->configure(compile_context, input, weights, biases, output, conv_info, act_info); _function = std::move(f); break; } case ConvolutionMethod::GEMM: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); + f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); _function = std::move(f); break; } case ConvolutionMethod::FFT: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info); + f->configure(compile_context, input, weights, biases, output, conv_info, act_info); _function = std::move(f); break; } diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 3692fda6b2..4c5d62a82c 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -36,9 +36,14 @@ using namespace arm_compute; void CLCopy::configure(ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLCopy::configure(const 
CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 5e1278df5b..17fc80e146 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/runtime/CL/functions/CLCropResize.h" #include "arm_compute/core/CL/CLHelpers.h" - #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLCropResize.h" #include @@ -51,120 +50,10 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, static_cast(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } - -inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value) -{ - bool is_width_flipped = end[0] < start[0]; - bool is_height_flipped = end[1] < start[1]; - /** The number of rows out of bounds at the start and end of output. */ - std::array rows_out_of_bounds{ 0 }; - /** The number of columns out of bounds at the start and end of output. */ - std::array cols_out_of_bounds{ 0 }; - if(is_height_flipped) - { - rows_out_of_bounds[0] = start[1] >= static_cast(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? 
std::min(-end[1], static_cast(output->info()->dimension(2))) : 0; - } - else - { - rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast(output->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0; - } - if(is_width_flipped) - { - cols_out_of_bounds[0] = start[0] >= static_cast(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast(output->info()->dimension(1))) : 0; - } - else - { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast(output->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0; - } - - Window full_window = calculate_max_window(*output->info()); - - // Full output window: - // -------------------------------- - // | Out of bounds | - // | rows before | - // |------------------------------| - // | Out of | In | Out of | - // | bounds | bounds | bounds | - // | cols | elements | cols | - // | before | copied | after | - // | | from input | | - // |------------------------------| - // | Out of bounds | - // | rows after | - // |------------------------------| - // Use a separate output window for each section of the full output window. - // Fill all output rows that have no elements that are within the input bounds - // with the extrapolation value using memset. - // First for the rows before the in bounds rows. 
- if(rows_out_of_bounds[0] > 0) - { - Window slice_fill_rows_before(full_window); - slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); - auto kernel = arm_compute::support::cpp14::make_unique(); - kernel->configure(output, extrapolation_value, &slice_fill_rows_before); - CLScheduler::get().enqueue(*kernel); - } - - Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1)); - - int rows_in_bounds = static_cast(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) - { - // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) - { - Window slice_fill_cols_before(slice_in); - slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); - auto kernel = arm_compute::support::cpp14::make_unique(); - kernel->configure(output, extrapolation_value, &slice_fill_cols_before); - CLScheduler::get().enqueue(*kernel); - } - - if(cols_out_of_bounds[1] > 0) - { - Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1)); - auto kernel = arm_compute::support::cpp14::make_unique(); - kernel->configure(output, extrapolation_value, &slice_fill_cols_after); - CLScheduler::get().enqueue(*kernel); - } - - // Copy all elements within the input bounds from the input tensor. - int cols_in_bounds = static_cast(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) - { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? 
start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; - auto kernel = arm_compute::support::cpp14::make_unique(); - - kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in); - CLScheduler::get().enqueue(*kernel); - } - } - - // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) - { - Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1)); - auto kernel = arm_compute::support::cpp14::make_unique(); - kernel->configure(output, extrapolation_value, &slice_fill_rows_after); - CLScheduler::get().enqueue(*kernel); - } -} } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results() + : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_kernels() { } @@ -190,9 +79,18 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value); +} + +void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, 
ICLTensor *output, Coordinates2D crop_size, + InterpolationPolicy method, float extrapolation_value) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind); ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); + auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32); + _num_boxes = boxes->info()->tensor_shape()[1]; TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y); @@ -210,7 +108,13 @@ void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor // - A scale function is used to resize the cropped image to the size specified by crop_size. // - A tensor is required to hold the final scaled image before it is copied into the 4D output // that will hold all final cropped and scaled 3D images using CLCopyKernel. - for(unsigned int i = 0; i < _num_boxes; ++i) + + // The contents of _boxes and _box_ind are required to calculate the shape + // of the initial cropped image and thus are required to configure the + // kernels used for cropping and scaling. 
+ _boxes->map(CLScheduler::get().queue()); + _box_ind->map(CLScheduler::get().queue()); + for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { auto crop_tensor = support::cpp14::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -223,45 +127,149 @@ void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor scaled_result_info.set_data_layout(DataLayout::NHWC); scale_tensor->allocator()->init(scaled_result_info); _scaled_results.emplace_back(std::move(scale_tensor)); - } -} -void CLCropResize::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - // The contents of _boxes and _box_ind are required to calculate the shape - // of the initial cropped image and thus are required to configure the - // kernels used for cropping and scaling. - _boxes->map(CLScheduler::get().queue()); - _box_ind->map(CLScheduler::get().queue()); - for(unsigned int i = 0; i < _num_boxes; ++i) - { - // Size of the crop box in _boxes and thus the shape of _crop_results[i] - // may not be known until run-time and so the kernels cannot be configured until then. 
+ // Size of the crop box in _boxes has to be given before the configure uint32_t batch_index; Coordinates start{}; Coordinates end{}; - configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index); + configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = support::cpp14::make_unique(); - scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT); + scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); - win.set(3, Window::Dimension(i, i + 1, 1)); + win.set(3, Window::Dimension(num_box, num_box + 1, 1)); auto copy_kernel = support::cpp14::make_unique(); - copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win); + copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win); _copy.emplace_back(std::move(copy_kernel)); - _crop_results[i]->allocator()->allocate(); - _scaled_results[i]->allocator()->allocate(); + _crop_results[num_box]->allocator()->allocate(); + _scaled_results[num_box]->allocator()->allocate(); + + bool is_width_flipped = end[0] < start[0]; + bool is_height_flipped = end[1] < start[1]; + /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ + std::array rows_out_of_bounds{ 0 }; + /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */ + std::array cols_out_of_bounds{ 0 }; + if(is_height_flipped) + { + rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) ? 
std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + } + else + { + rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + } + if(is_width_flipped) + { + cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + } + else + { + cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + } + + Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); + + // Full _crop_results[num_box].get() window: + // -------------------------------- + // | Out of bounds | + // | rows before | + // |------------------------------| + // | Out of | In | Out of | + // | bounds | bounds | bounds | + // | cols | elements | cols | + // | before | copied | after | + // | | from input | | + // |------------------------------| + // | Out of bounds | + // | rows after | + // |------------------------------| + // Use a separate _crop_results[num_box].get() window for each section of the full _crop_results[num_box].get() window. 
+ // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds + // with the extrapolation value using memset. + // First for the rows before the in bounds rows. + if(rows_out_of_bounds[0] > 0) + { + Window slice_fill_rows_before(full_window); + slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + _internal_kernels.push_back(std::move(kernel)); + } + + Window slice_in(full_window); + slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if(rows_in_bounds > 0) + { + // Fill all elements that share a row with an in bounds element with the extrapolation value. 
+ if(cols_out_of_bounds[0] > 0) + { + Window slice_fill_cols_before(slice_in); + slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + _internal_kernels.push_back(std::move(kernel)); + } - run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value); + if(cols_out_of_bounds[1] > 0) + { + Window slice_fill_cols_after(slice_in); + slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + _internal_kernels.push_back(std::move(kernel)); + } + + // Copy all elements within the input bounds from the input tensor. + int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if(cols_in_bounds > 0) + { + Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; + Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; + auto kernel = arm_compute::support::cpp14::make_unique(); + + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); + _internal_kernels.push_back(std::move(kernel)); + } + } + + // Fill all rows after the in bounds elements with the extrapolation value. 
+ if(rows_out_of_bounds[1] > 0) + { + Window slice_fill_rows_after(full_window); + slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); + _internal_kernels.push_back(std::move(kernel)); + } } _boxes->unmap(CLScheduler::get().queue()); _box_ind->unmap(CLScheduler::get().queue()); CLScheduler::get().sync(); +} + +void CLCropResize::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); + + for(unsigned int i = 0; i < _internal_kernels.size(); ++i) + { + CLScheduler::get().enqueue(*(_internal_kernels[i])); + } + + CLScheduler::get().sync(); for(auto &kernel : _scale) { kernel->run(); diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 7aa771428d..62e7d9a582 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,6 +43,12 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr memor void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); +} + +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -51,14 +57,14 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const case DeconvolutionMethod::DIRECT: { auto f = arm_compute::support::cpp14::make_unique(); - f->configure(input, weights, bias, output, deconv_info, weights_info); + f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info); _function = std::move(f); break; } case DeconvolutionMethod::GEMM: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, bias, output, deconv_info); + f->configure(compile_context, input, weights, bias, output, deconv_info); _function = std::move(f); break; } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index eaf7c66083..be2d120dcd 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,12 +43,17 @@ Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const IT } void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, info); +} + +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(input, _output, info); + _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample.configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index 88e3cbcc17..b848f989e6 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); +} + +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, policy, shift); + k->configure(compile_context, input, output, policy, shift); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 5af979539a..89e5faa4d5 100644 --- 
a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); +} + +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_shape); + k->configure(compile_context, input, output, block_shape); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 00f6f69771..0b7a33401d 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -125,7 +125,14 @@ CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptrinfo(), @@ -193,11 +207,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( _memory_group.manage(&_permuted_output); // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); + _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); _permuted_input.info()->set_data_layout(DataLayout::NHWC); // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); + _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); _permuted_weights.info()->set_data_layout(DataLayout::NHWC); // Set output quantization info before dwc kernel configure @@ -226,7 +240,7 @@ void 
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1; DWCKernelInfo dwc_info; dwc_info.activation_info = act_info; - _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, + _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers_to_use, output_shifts_to_use); @@ -236,7 +250,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( // Configure the function to transform the convoluted output to NCHW format _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _permute_output_to_nchw.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); + _permute_output_to_nchw.configure(compile_context, &_permuted_output, output, PermutationVector(1U, 2U, 0U)); _permuted_output.allocator()->allocate(); } @@ -385,6 +399,13 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwise void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); +} + +void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { const GPUTarget gpu_target = CLScheduler::get().target(); @@ -429,11 +450,11 @@ void 
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config _memory_group.manage(&_permuted_output); // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); _permuted_weights.info()->set_data_layout(DataLayout::NCHW); _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); @@ -447,7 +468,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { if(_needs_weights_reshape) { - _reshape_weights.configure(weights, &_permuted_weights, info); + _reshape_weights.configure(compile_context, weights, &_permuted_weights, info); weights_to_use = &_permuted_weights; } _kernel = arm_compute::support::cpp14::make_unique(); @@ -473,7 +494,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config // Configure kernel _kernel->set_target(gpu_target); - _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, + _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation, output_multipliers_to_use, output_shifts_to_use); if(_is_quantized) @@ -487,7 +508,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); - 
_permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U)); // Allocate tensors _permuted_input.allocator()->allocate(); @@ -499,7 +520,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { zero_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); } - _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); + _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); } Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, @@ -574,6 +595,14 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptrinfo(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, @@ -582,12 +611,12 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w { case DepthwiseConvolutionFunction::OPTIMIZED: _func_3x3.set_memory_group(_memory_manager); - _func_3x3.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); break; case DepthwiseConvolutionFunction::GENERIC: { _func_generic.set_memory_group(_memory_manager); - _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); } break; default: diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 6e4aab2eb9..362b36cc95 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -29,9 +29,14 @@ namespace arm_compute { void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp index 799724bf2d..68d3752463 100644 --- a/src/runtime/CL/functions/CLDerivative.cpp +++ b/src/runtime/CL/functions/CLDerivative.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode 
border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp index 5962c1dc88..05351a9de3 100644 --- a/src/runtime/CL/functions/CLDilate.cpp +++ b/src/runtime/CL/functions/CLDilate.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git 
a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 9828cd10ec..6e9782f77a 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -38,12 +38,19 @@ CLDirectConvolutionLayer::CLDirectConvolutionLayer() } void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Set GPU target _direct_conv_kernel.set_target(CLScheduler::get().target()); // Configure direct convolution - _direct_conv_kernel.configure(input, weights, biases, output, conv_info); + _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info); // Configure border handler PixelValue &&zero_value(0.f); @@ -51,7 +58,7 @@ void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weig { zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); } - _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value); + _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value); // Tune kernels CLScheduler::get().tune_kernel_static(_direct_conv_kernel); @@ -61,7 +68,7 @@ void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weig //Configure Activation Layer if(_is_activationlayer_enabled) { - _activationlayer_function.configure(output, nullptr, act_info); + 
_activationlayer_function.configure(compile_context, output, nullptr, act_info); } } diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 8d90723c95..da16bed3e0 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -87,10 +87,10 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid."); - unsigned int deconv_pad_x = 0; - unsigned int deconv_pad_y = 0; - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; + unsigned int deconv_pad_x = 0; + unsigned int deconv_pad_y = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); @@ -103,6 +103,12 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); +} + +void 
CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -110,8 +116,8 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const unsigned int pad_right = info.pad_right(); const unsigned int pad_top = info.pad_top(); const unsigned int pad_bottom = info.pad_bottom(); - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; const DataLayout data_layout = input->info()->data_layout(); @@ -121,7 +127,7 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, _original_weights = weights; _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); @@ -146,14 +152,14 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0; deconv_pad_x -= deconv_pad_left + deconv_pad_right; ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0); - deconv_pad_left += deconv_pad_x / 2; + deconv_pad_left += deconv_pad_x / 2; deconv_pad_right += deconv_pad_x / 2; unsigned int deconv_pad_top = pad_bottom > pad_top ? pad_bottom - pad_top : 0; unsigned int deconv_pad_bottom = pad_top > pad_bottom ? 
pad_top - pad_bottom : 0; deconv_pad_y -= deconv_pad_top + deconv_pad_bottom; ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0); - deconv_pad_top += deconv_pad_y / 2; + deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); @@ -162,11 +168,11 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, // configure scale function const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); - _scale_f.configure(input, &_scaled_output, upsample_info); + _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); + _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); _scaled_output.allocator()->allocate(); // Setup flip axis data diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp index 9955d240f9..ce615327a9 100644 --- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::RSQRT); + k->configure(compile_context, input, output, ElementWiseUnary::RSQRT); _kernel = std::move(k); } Status 
CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -42,9 +47,14 @@ Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *outpu } void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::EXP); + k->configure(compile_context, input, output, ElementWiseUnary::EXP); _kernel = std::move(k); } Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -53,9 +63,14 @@ Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::NEG); + k->configure(compile_context, input, output, ElementWiseUnary::NEG); _kernel = std::move(k); } Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -64,9 +79,14 @@ Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::SIN); + k->configure(compile_context, input, output, ElementWiseUnary::SIN); _kernel = std::move(k); } Status CLSinLayer::validate(const ITensorInfo 
*input, const ITensorInfo *output) @@ -75,9 +95,14 @@ Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::ABS); + k->configure(compile_context, input, output, ElementWiseUnary::ABS); _kernel = std::move(k); } Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -85,9 +110,14 @@ Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output) return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS); } void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::LOG); + k->configure(compile_context, input, output, ElementWiseUnary::LOG); _kernel = std::move(k); } Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -96,9 +126,14 @@ Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::ROUND); + k->configure(compile_context, input, output, ElementWiseUnary::ROUND); _kernel = 
std::move(k); } Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 7636a87e93..20e9545b61 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -33,7 +33,7 @@ namespace arm_compute { namespace { -void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) +void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) { if(output->info()->dimension(0) > 1) { @@ -41,18 +41,23 @@ void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize bor if(broadcasted_info->info()->dimension(0) == 1) { - border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE); + border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); } } } } // namespace void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); +} + +void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::ADD, input1, input2, output, policy, act_info); + k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + 
configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -61,11 +66,16 @@ Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorIn } void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); +} + +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::SUB, input1, input2, output, policy, act_info); + k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -75,11 +85,16 @@ Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITenso } void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const 
ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::DIV, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -88,11 +103,16 @@ Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorIn } void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::MAX, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -101,11 +121,16 @@ Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo * } void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::MIN, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -114,11 +139,16 @@ Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo * } void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const 
ActivationLayerInfo &act_info) @@ -127,11 +157,16 @@ Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITens } void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::POWER, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp index a0663b754a..e1bd7e6f2a 100644 --- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp +++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -89,9 +89,14 @@ CLEqualizeHistogram::CLEqualizeHistogram() void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output) { - _histogram_kernel.configure(input, &_hist); - _border_histogram_kernel.configure(input, &_hist); - _map_histogram_kernel.configure(input, &_cd_lut, output); + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output) +{ + _histogram_kernel.configure(compile_context, input, &_hist); + _border_histogram_kernel.configure(compile_context, input, &_hist); + _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output); } void CLEqualizeHistogram::run() diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp index 48d7be385d..8106148316 100644 --- a/src/runtime/CL/functions/CLErode.cpp +++ b/src/runtime/CL/functions/CLErode.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index 49b5a2a2e6..c3922f5e66 
100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,11 @@ CLFFT1D::CLFFT1D(std::shared_ptr memory_manager) } void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, config); +} + +void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config)); @@ -57,7 +62,7 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; @@ -72,7 +77,7 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn fft_kernel_info.radix = radix_for_stage; fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); - _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? 
output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } @@ -83,7 +88,7 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(compile_context, output, nullptr, scale_config); } // Allocate tensors diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index f5776cbd88..2482ea901a 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -35,6 +35,11 @@ CLFFT2D::CLFFT2D(std::shared_ptr memory_manager) } void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, config); +} + +void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config)); @@ -44,13 +49,13 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn first_pass_config.axis = config.axis0; first_pass_config.direction = config.direction; _memory_group.manage(&_first_pass_tensor); - _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config); + _first_pass_func.configure(compile_context, input, &_first_pass_tensor, first_pass_config); // Setup second pass FFT1DInfo second_pass_config; second_pass_config.axis = config.axis1; second_pass_config.direction = config.direction; - _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config); + 
_second_pass_func.configure(compile_context, &_first_pass_tensor, output, second_pass_config); _first_pass_tensor.allocator()->allocate(); } diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index afb1cab520..ff439cca8d 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -97,6 +97,12 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr mem void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { _original_weights = weights; _original_bias = biases; @@ -121,7 +127,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights // Permute bias if(biases != nullptr) { - _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); + _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); } @@ -131,11 +137,11 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permute_input_func.configure(compile_context, input, 
&_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permute_weights_func.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); _permuted_weights.info()->set_data_layout(DataLayout::NCHW); input_to_use = &_permuted_input; @@ -145,20 +151,20 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights // Flip weights _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); + _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; - _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); + _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights _transform_weights_func = support::cpp14::make_unique(); - _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); + _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; _memory_group.manage(&_padded_input); - _pad_input_func.configure(input_to_use, &_padded_input, padding_in); + _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); if(_needs_permute) { _permuted_input.allocator()->allocate(); @@ -166,17 +172,17 @@ void 
CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights // Transform input _memory_group.manage(&_transformed_input); - _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo()); + _transform_input_func.configure(compile_context, &_padded_input, &_transformed_input, FFT2DInfo()); _padded_input.allocator()->allocate(); // Perform product _memory_group.manage(&_output_product); - _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product); + _prod_func.configure(compile_context, &_transformed_input, &_transformed_weights, &_output_product); _transformed_input.allocator()->allocate(); // Perform reduction _memory_group.manage(&_output_reduced); - _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM); + _reduce_func.configure(compile_context, &_output_product, &_output_reduced, 2, ReductionOperation::SUM); _output_product.allocator()->allocate(); // Transform output @@ -184,7 +190,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); - _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); + _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); // Reshape output @@ -206,7 +212,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), 
Coordinates(end_right, end_botton)); _itransformed_output.allocator()->allocate(); // Add bias @@ -219,7 +225,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights _memory_group.manage(&_permuted_output); } auto_init_if_empty(*output_to_use->info(), *_bias_output.info()); - _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP); + _bias_add_func.configure(compile_context, &_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP); _bias_output.allocator()->allocate(); } @@ -228,7 +234,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + _permute_output_func.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U)); // Allocate tensors _permuted_output.allocator()->allocate(); @@ -238,7 +244,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights _is_activationlayer_enabled = act_info.enabled(); if(_is_activationlayer_enabled) { - _activation_layer_func.configure(output, nullptr, act_info); + _activation_layer_func.configure(compile_context, output, nullptr, act_info); } // Setup flip axis data diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp index fe2a18cd30..f51abf0880 100644 --- a/src/runtime/CL/functions/CLFastCorners.cpp +++ b/src/runtime/CL/functions/CLFastCorners.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -54,6 +54,12 @@ CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value); +} + +void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, + unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode); @@ -72,19 +78,19 @@ void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonma const bool update_number = (nullptr != _num_corners); _memory_group.manage(&_output); - _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode); + _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); if(!_non_max) { - _copy_array_kernel.configure(&_output, update_number, _corners, &_num_buffer); + _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer); } else { _suppr.allocator()->init(tensor_info); _memory_group.manage(&_suppr); - _suppr_func.configure(&_output, &_suppr, border_mode); - _copy_array_kernel.configure(&_suppr, update_number, _corners, &_num_buffer); + _suppr_func.configure(compile_context, &_output, &_suppr, border_mode); + _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); _suppr.allocator()->allocate(); } diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index 035bb7ce8d..7b96ed1592 100644 --- 
a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,11 +30,15 @@ namespace arm_compute { - void CLFill::configure(ICLTensor *tensor, PixelValue constant_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value); +} + +void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(tensor, constant_value); + k->configure(compile_context, tensor, constant_value); _kernel = std::move(k); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp index b7d77bcbb7..f9d7396c5b 100644 --- a/src/runtime/CL/functions/CLFillBorder.cpp +++ b/src/runtime/CL/functions/CLFillBorder.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value); +} + +void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value); + k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 8f93e03c5e..9a247ccfcb 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -30,9 +30,14 @@ using 
namespace arm_compute; void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLFlattenLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); CLScheduler::get().tune_kernel_static(*_kernel); } diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 204ca7400c..44e1d39dc2 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -33,7 +33,7 @@ void CLFloor::configure(const ICLTensor *input, ICLTensor *output) configure(CLKernelLibrary::get().get_compile_context(), input, output); } -void CLFloor::configure(CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +void CLFloor::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output); diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 0c0fbd5c9d..ecbac6f703 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -146,9 +146,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } // namespace void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLFullyConnectedLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + 
k->configure(compile_context, input, output); _kernel = std::move(k); } @@ -163,7 +168,8 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr mem _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } -void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info) +void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) { GEMMLowpOutputStageInfo gemmlowp_output_stage; construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info); @@ -190,7 +196,7 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function - _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); + _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info); // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers input->info()->set_quantization_info(input_quantization_info); @@ -199,11 +205,12 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor else { // Configure matrix multiply kernel - _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info); + _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info); } } -void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info) +void CLFullyConnectedLayer::configure_conv_fc(const 
CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); @@ -215,25 +222,32 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT // Configure flatten kernel _memory_group.manage(&_flatten_output); - _flatten_layer.configure(input, &_flatten_output); + _flatten_layer.configure(compile_context, input, &_flatten_output); // Configure matrix multiply kernel - configure_mm(&_flatten_output, weights, bias, output, fc_info); + configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info); // Allocate the output tensor for flatten once all the configure methods have been called _flatten_output.allocator()->allocate(); } -void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info) +void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); // Configure matrix multiply kernel - configure_mm(input, weights, bias, output, fc_info); + configure_mm(compile_context, input, weights, bias, output, fc_info); } void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); +} + +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor 
*biases, ICLTensor *output, + FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -282,13 +296,13 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w { if(_weights_manager && _weights_manager->are_weights_managed(weights)) { - _reshape_weights_managed_function.configure(weights); + _reshape_weights_managed_function.configure(compile_context, weights); weights_to_use = utils::cast::polymorphic_downcast(_weights_manager->acquire(weights, &_reshape_weights_managed_function)); } else { // Reshape the weights - _reshape_weights_function.configure(weights, &_reshape_weights_output); + _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } } @@ -298,7 +312,7 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w { if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) { - _convert_weights_managed.configure(weights_to_use, + _convert_weights_managed.configure(compile_context, weights_to_use, input->info()->tensor_shape(), fc_info.weights_trained_layout); weights_to_use = utils::cast::polymorphic_downcast(_weights_manager->acquire(weights, &_convert_weights_managed)); @@ -306,7 +320,7 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w else { // Convert weights - _convert_weights.configure(weights_to_use, + _convert_weights.configure(compile_context, weights_to_use, &_converted_weights_output, input->info()->tensor_shape(), fc_info.weights_trained_layout); @@ -319,12 +333,12 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w if(_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, biases, output, fc_info); + configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info); } else { // Fully Connected layer after a 
Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, biases, output, fc_info); + configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info); } } diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 72dd27e3cc..6deecdc089 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,7 +41,15 @@ void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const I const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, float epsilon, FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); +} + +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, + ICLTensor *fused_weights, ICLTensor *fused_bias, + const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, + float epsilon, FuseBatchNormalizationType fbn_type) +{ + _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 74d59cdad1..8466024c04 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ 
b/src/runtime/CL/functions/CLGEMM.cpp @@ -81,7 +81,8 @@ CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsi return gemm_kernel->select_kernel(params); } -void CLGEMM::configure_native_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, + const GEMMInfo &gemm_info) { const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); const unsigned int n = b->info()->dimension(0); @@ -94,13 +95,14 @@ void CLGEMM::configure_native_v1(const ICLTensor *a, const ICLTensor *b, const I GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); // Configure and tune matrix multiply kernel - _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); // Tune kernel statically CLScheduler::get().tune_kernel_static(_mm_kernel); } -void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, + const GEMMInfo &gemm_info) { bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); const unsigned int m = reinterpret_input_as_3d ? 
(a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); @@ -148,22 +150,22 @@ void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const } // Configure interleave kernel - _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d); + _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); // Configure transpose kernel ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(b, rhs_info); + _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); } else { - _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); + _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune matrix multiply kernel - _mm_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); CLScheduler::get().tune_kernel_static(_mm_kernel); @@ -176,7 +178,8 @@ void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const } } -void CLGEMM::configure_reshaped(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->info()->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); @@ -223,21 +226,21 @@ void CLGEMM::configure_reshaped(const ICLTensor *a, const ICLTensor 
*b, const IC // Configure lhs_info and rhs_info std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); + _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(b, rhs_info); + _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); } else { - _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); + _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune matrix multiply kernel - _mm_reshaped_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); // Allocate intermediate tensors _tmp_a.allocator()->allocate(); @@ -248,7 +251,8 @@ void CLGEMM::configure_reshaped(const ICLTensor *a, const ICLTensor *b, const IC } } -void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->info()->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); @@ -293,16 +297,16 @@ void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - 
_reshape_rhs_kernel_managed.configure(b, rhs_info); + _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); } else { - _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); + _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); if(!_reshape_b_only_on_first_run && use_mm_b) { @@ -483,6 +487,11 @@ Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf } void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); +} + +void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -511,22 +520,22 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * { case CLGEMMKernelType::NATIVE_V1: { - configure_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info); + configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); break; } case CLGEMMKernelType::RESHAPED_V1: { - configure_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info); + configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); break; } case CLGEMMKernelType::RESHAPED: { - configure_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info); + configure_reshaped_v2(compile_context, a, b, 
c_to_use, output, alpha, beta, gemm_info); break; } case CLGEMMKernelType::RESHAPED_ONLY_RHS: { - configure_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info); + configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); break; } default: diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 5398050533..1c37993bda 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -47,6 +47,11 @@ CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() } void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) +{ + configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups); +} + +void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); @@ -58,7 +63,7 @@ void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); const ICLTensor *biases_to_use = (append_biases) ? 
biases : nullptr; - _weights_reshape_kernel.configure(weights, biases_to_use, output, num_groups); + _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups); output->info()->set_quantization_info(weights->info()->quantization_info()); } @@ -100,7 +105,8 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr m { } -void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, +void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); @@ -127,7 +133,7 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - _mm_gemmlowp.configure(input, weights, biases, output, gemm_info); + _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info); // Revert back QuantizatioInfo as input and weights could be used in other convolution layers input->info()->set_quantization_info(input_quantization_info); @@ -136,7 +142,7 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso else { // Configure matrix multiply function - _mm_gemm.configure(input, weights, biases, output, 1.0f, 1.0f, gemm_info); + _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info); } } @@ -180,6 +186,13 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo 
*input, const ITens void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); +} + +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -252,24 +265,24 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * if(_weights_manager && _weights_manager->are_weights_managed(weights)) { - _reshape_weights_managed.configure(weights, biases, num_groups); + _reshape_weights_managed.configure(compile_context, weights, biases, num_groups); weights_to_use = utils::cast::polymorphic_downcast(_weights_manager->acquire(weights, &_reshape_weights_managed)); } else { - _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups); + _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups); } } else { if(_weights_manager && _weights_manager->are_weights_managed(weights)) { - _reshape_weights_managed.configure(weights, nullptr, num_groups); + _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups); weights_to_use = utils::cast::polymorphic_downcast(_weights_manager->acquire(weights, &_reshape_weights_managed)); } else { - _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups); + _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, 
num_groups); } } @@ -279,7 +292,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * _memory_group.manage(&_im2col_output); // Configure and tune im2col. im2col output shape is auto-initialized - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); + _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); // Set quantization info _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); @@ -367,7 +380,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - configure_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info); + configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info); if(!_skip_im2col) { @@ -377,7 +390,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * if(!_skip_col2im) { // Configure and tune Col2Im - _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); + _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); CLScheduler::get().tune_kernel_static(_col2im_kernel); } @@ -391,7 +404,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * if(!_fuse_activation) { - _activationlayer_function.configure(output, nullptr, act_info); + _activationlayer_function.configure(compile_context, output, nullptr, act_info); } ARM_COMPUTE_UNUSED(weights_info); diff --git 
a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 3298858215..1dcb341fe7 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -28,7 +28,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "utils/TypePrinter.h" #include #include @@ -64,29 +63,29 @@ std::pair compute_start_end_slice_coordinates(const IT } Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) { - const auto data_type = input->data_type(); + const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier(0); - int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_stage_info.gemmlowp_multiplier = output_multiplier; - output_stage_info.gemmlowp_shift = output_shift; - output_stage_info.gemmlowp_offset = oq_info.offset; - const auto min_max_bound = get_min_max(data_type); - output_stage_info.gemmlowp_min_bound = (std::get<0>(min_max_bound)).get(); - output_stage_info.gemmlowp_max_bound = (std::get<1>(min_max_bound)).get(); - output_stage_info.output_data_type = data_type; - } - return Status{}; + if(is_data_type_quantized_asymmetric(data_type)) + { + const UniformQuantizationInfo iq_info 
= input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); + + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + int output_multiplier(0); + int output_shift(0); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_stage_info.gemmlowp_multiplier = output_multiplier; + output_stage_info.gemmlowp_shift = output_shift; + output_stage_info.gemmlowp_offset = oq_info.offset; + const auto min_max_bound = get_min_max(data_type); + output_stage_info.gemmlowp_min_bound = (std::get<0>(min_max_bound)).get(); + output_stage_info.gemmlowp_max_bound = (std::get<1>(min_max_bound)).get(); + output_stage_info.output_data_type = data_type; + } + return Status{}; } } // namespace @@ -175,7 +174,6 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); - } else { @@ -214,6 +212,12 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso } void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); +} + +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo 
&deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), @@ -237,9 +241,9 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor if(_is_nchw) { _memory_group.manage(&_permuted_input); - _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); + _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); - _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); + _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); input_to_use = &_permuted_input; weights_to_use = &_permuted_weights; @@ -251,8 +255,8 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor 1, input->info()->data_type(), weights->info()->quantization_info())); - _reshape_weights.configure(weights_to_use, &_reshaped_weights); - _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t); + _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); + _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); @@ -268,14 +272,14 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); - _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); + _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, 
&_gemm_output, gemm_info); input_to_use->info()->set_quantization_info(iq_info); _reshaped_weights_t.info()->set_quantization_info(wq_info); } else { - _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); } if(_is_nchw) @@ -313,14 +317,14 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + _deconv_reshape.configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); _gemm_output.allocator()->allocate(); if(_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); _gemmlowp_final.allocator()->allocate(); } @@ -328,7 +332,7 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor if(_padded_input) { const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); - _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second); + _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 3465da95b7..84da4a7e98 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ 
b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -100,6 +100,11 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); @@ -144,7 +149,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor TensorInfo weights_info(*b->info()); weights_info.set_data_type(DataType::QASYMM8); _qasymm8_weights.allocator()->init(weights_info); - _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); + _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); } const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; @@ -162,7 +167,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure reshape RHS kernel - _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); } // Using default reduction info @@ -179,7 +184,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -190,7 +195,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, reduction_info); + _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info); } GEMMKernelInfo gemm_kernel_info; @@ -220,7 +225,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else @@ -231,7 +236,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor if(_is_gemm_reshaped) { - _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); } else { @@ -239,11 +244,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure matrix multiply kernel - _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0), + _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, output, + a->info()->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - _mm_result_s32.allocator()->allocate(); } } @@ -264,7 +269,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor if(_is_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); } else { @@ -272,11 +277,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure matrix multiply kernel - _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); } // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, + _b_offset); } // Allocate tensors diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index aff7f54a82..18e002aa3d 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -43,7 +43,23 @@ void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, info.gemmlowp_max_bound = max; auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, &info); + k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info); + _kernel = std::move(k); +} + +void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, + int result_mult_int, + int result_shift, int min, int max) +{ + GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); + info.gemmlowp_offset = result_offset; + info.gemmlowp_multiplier = result_mult_int; + info.gemmlowp_shift = result_shift; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } @@ -59,9 +75,16 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *inpu void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); +} + +void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, 
const ICLTensor *bias, ICLTensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, + int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); _kernel = std::move(k); } @@ -76,7 +99,16 @@ void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTens int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + _kernel = std::move(k); +} + +void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, + int min, int max) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); _kernel = std::move(k); } @@ -97,7 +129,22 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor * info.gemmlowp_max_bound = max; auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, &info); + k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info); + _kernel = std::move(k); +} + +void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, + float 
multiplier, int offset, + int min, int max) +{ + GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); + info.gemmlowp_offset = offset; + info.gemmlowp_real_multiplier = multiplier; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } @@ -113,9 +160,16 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInf void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); +} + +void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, + int result_fixedpoint_multiplier, int result_shift, + int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); + k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); _kernel = std::move(k); } @@ -126,6 +180,11 @@ Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITens } void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); +} + +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -138,14 +197,14 @@ void 
CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *b case DataType::QASYMM8: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QASYMM8_SIGNED: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } @@ -164,14 +223,14 @@ void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *b case GEMMLowpOutputStageType::QUANTIZE_DOWN: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, &info); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); break; } case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, &info); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); break; } diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index 08ec4e02d1..e2b18e0f55 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -30,9 +30,14 @@ namespace arm_compute { void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, 
axis); +} + +void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, indices, output, axis); + k->configure(compile_context, input, indices, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp index c187891b12..47367c4b17 100644 --- a/src/runtime/CL/functions/CLGaussian3x3.cpp +++ b/src/runtime/CL/functions/CLGaussian3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp index ea803e4796..6b82cd0c35 100644 --- a/src/runtime/CL/functions/CLGaussian5x5.cpp +++ b/src/runtime/CL/functions/CLGaussian5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,6 +41,11 @@ CLGaussian5x5::CLGaussian5x5(std::shared_ptr memory_manager) } void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -50,9 +55,9 @@ void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode bo _memory_group.manage(&_tmp); // Configure kernels - _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); + _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); + _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffers _tmp.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp index 3cda1be2df..1ac98787ac 100644 --- a/src/runtime/CL/functions/CLGaussianPyramid.cpp +++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp @@ -56,6 +56,11 @@ CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT } void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); +} + +void 
CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON(pyramid == nullptr); @@ -90,16 +95,16 @@ void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, Bord for(size_t i = 0; i < num_levels - 1; ++i) { /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); + _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); /* Configure vertical kernel */ - _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); + _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); /* Configure border */ - _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); + _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); /* Configure border */ - _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); + _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); } _tmp.allocate(); } @@ -136,6 +141,11 @@ CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT } void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); +} + +void 
CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON(nullptr == pyramid); @@ -162,10 +172,10 @@ void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, Borde for(size_t i = 0; i < num_levels - 1; ++i) { /* Configure gaussian 5x5 */ - _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); + _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); /* Configure scale image kernel */ - _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER); + _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER); } _tmp.allocate(); diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 7d16753320..7f037fc51f 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -63,6 +63,13 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); @@ -84,7 +91,7 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel.configure(compile_context, anchors, &_all_anchors, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); @@ -94,13 +101,13 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso if(!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened); + _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _flatten_deltas_kernel.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } else { - _flatten_deltas_kernel.configure(deltas, &_deltas_flattened); + _flatten_deltas_kernel.configure(compile_context, deltas, &_deltas_flattened); } const TensorShape flatten_shape_scores(1, total_num_anchors); @@ -111,13 +118,13 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso if(!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened); + _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _flatten_scores_kernel.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } else { - _flatten_scores_kernel.configure(scores, &_scores_flattened); + _flatten_scores_kernel.configure(compile_context, scores, &_scores_flattened); } CLTensor *anchors_to_use = &_all_anchors; @@ -129,18 +136,18 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso _memory_group.manage(&_all_anchors_f32); 
_memory_group.manage(&_deltas_flattened_f32); // Dequantize anchors to float - _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32); + _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32); _all_anchors.allocator()->allocate(); anchors_to_use = &_all_anchors_f32; // Dequantize deltas to float - _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32); + _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); _deltas_flattened.allocator()->allocate(); deltas_to_use = &_deltas_flattened_f32; } // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f); - _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); @@ -150,7 +157,7 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); - _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized); + _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; } @@ -185,7 +192,7 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); _proposals_4_roi_values.allocator()->allocate(); } diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp index 09314439a8..0645cfdf22 100644 --- a/src/runtime/CL/functions/CLHOGDescriptor.cpp +++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,11 @@ CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr memory_manager) } void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value); +} + +void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON(nullptr == output); @@ -76,16 +81,16 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG _memory_group.manage(&_phase); // Initialise gradient kernel - _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); + _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); // Manage intermediate buffers _memory_group.manage(&_hog_space); // Initialise orientation binning kernel - _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info()); + _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); // Initialize HOG norm 
kernel - _block_norm.configure(&_hog_space, output, hog->info()); + _block_norm.configure(compile_context, &_hog_space, output, hog->info()); // Allocate intermediate tensors _mag.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp index 8eb5e4251f..bf9bae1e8b 100644 --- a/src/runtime/CL/functions/CLHOGDetector.cpp +++ b/src/runtime/CL/functions/CLHOGDetector.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,12 @@ CLHOGDetector::CLHOGDetector() } void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class); +} + +void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, + float threshold, size_t idx_class) { _detection_windows = detection_windows; @@ -44,7 +50,7 @@ void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDete _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); // Configure HOGDetectorKernel - _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); + _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); } void CLHOGDetector::run() diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp index e509fd8e36..acf5f2c568 100644 --- 
a/src/runtime/CL/functions/CLHOGGradient.cpp +++ b/src/runtime/CL/functions/CLHOGGradient.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,12 @@ CLHOGGradient::CLHOGGradient(std::shared_ptr memory_manager) } void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value); +} + +void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); @@ -52,16 +58,16 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL _memory_group.manage(&_gy); // Initialise derivate kernel - _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); + _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); // Initialise magnitude/phase kernel if(PhaseType::UNSIGNED == phase_type) { - _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); + _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); } else { - _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); + _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); } // Allocate intermediate tensors diff --git 
a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp index 54ad1b35bf..248f7307e6 100644 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp @@ -54,6 +54,14 @@ CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr memory_ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression, + min_distance); +} + +void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, + ICLSize2DArray *detection_window_strides, BorderMode border_mode, + uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); @@ -145,7 +153,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h _memory_group.manage(&_phase); // Initialise gradient kernel - _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); + _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value); // Configure NETensor for the HOG space and orientation binning kernel for(size_t i = 0; i < _num_orient_bin_kernel; ++i) @@ -173,7 +181,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h _memory_group.manage(&_hog_space[i]); // Initialise orientation binning kernel - 
_orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); + _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); } // Allocate intermediate tensors @@ -194,7 +202,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h _memory_group.manage(&_hog_norm_space[i]); // Initialize block normalization kernel - _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); + _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); } // Allocate intermediate tensors @@ -210,7 +218,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h { const size_t idx_block_norm = input_hog_detect[i]; - _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i); + _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i); } detection_window_strides->unmap(CLScheduler::get().queue()); diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp index 3cae95f551..aecec0d3c5 100644 --- a/src/runtime/CL/functions/CLHarrisCorners.cpp +++ b/src/runtime/CL/functions/CLHarrisCorners.cpp @@ -64,6 +64,13 @@ CLHarrisCorners::CLHarrisCorners(std::shared_ptr memory_manager) void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, 
block_size, corners, border_mode, constant_border_value, use_fp16); +} + +void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, + float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, + BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) { ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); @@ -96,21 +103,21 @@ void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist case 3: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); _sobel = std::move(k); break; } case 5: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); _sobel = std::move(k); break; } case 7: { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); _sobel = std::move(k); break; } @@ -126,11 +133,11 @@ void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist _memory_group.manage(&_score); // Set/init Harris Score kernel accordingly with block_size - _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); // Configure border filling using harris score kernel's block size - _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, 
PixelValue(constant_border_value)); - _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); + _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); + _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -140,7 +147,7 @@ void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist _memory_group.manage(&_nonmax); // Init non-maxima suppression function - _non_max_suppr.configure(&_score, &_nonmax, border_mode); + _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode); // Allocate intermediate buffers _score.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp index eb543387f5..e723024334 100644 --- a/src/runtime/CL/functions/CLHistogram.cpp +++ b/src/runtime/CL/functions/CLHistogram.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,8 +34,13 @@ CLHistogram::CLHistogram() void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output) { - _kernel.configure(input, output); - _kernel_border.configure(input, output); + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output) +{ + _kernel.configure(compile_context, input, output); + _kernel_border.configure(compile_context, input, output); } void CLHistogram::run() diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index e639e74394..273a873c81 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -33,9 +33,14 @@ CLInstanceNormalizationLayer::CLInstanceNormalizationLayer() } void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); +} + +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + k->configure(compile_context, input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp index 2d54be32fa..b3be2f8c2c 100644 --- a/src/runtime/CL/functions/CLIntegralImage.cpp +++ b/src/runtime/CL/functions/CLIntegralImage.cpp @@ -1,5 +1,5 
@@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,8 +35,13 @@ CLIntegralImage::CLIntegralImage() void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) { - _integral_hor.configure(input, output); - _integral_vert.configure(output); + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +{ + _integral_hor.configure(compile_context, input, output); + _integral_vert.configure(compile_context, output); } void CLIntegralImage::run() diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 95a62c5317..14c83cd543 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -44,6 +44,11 @@ CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr memory_ma } void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); +} + +void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { // Reset auxiliary tensor _sumsq.allocator()->init(TensorInfo()); @@ -53,8 +58,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis // Configure kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); + _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); + _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensor 
_sumsq.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 3a3917784b..32ff813f43 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -58,6 +58,19 @@ void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *output_state_in, const ICLTensor *cell_state_in, ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); +} + +void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, const ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, + const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -97,7 +110,7 @@ void CLLSTMLayer::configure(const ICLTensor 
*input, _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_gate_out2); - _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2); + _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2); std::vector weights_vector; @@ -106,10 +119,10 @@ void CLLSTMLayer::configure(const ICLTensor *input, const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6); + _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); @@ -121,8 +134,8 @@ void CLLSTMLayer::configure(const ICLTensor *input, _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -137,15 +150,16 @@ void CLLSTMLayer::configure(const ICLTensor *input, _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); - _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before 
forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -158,8 +172,8 @@ void CLLSTMLayer::configure(const ICLTensor *input, { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -174,20 +188,20 @@ void CLLSTMLayer::configure(const ICLTensor *input, TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, 
input->info()->data_type())); - _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); + _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; if(_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -203,15 +217,16 @@ void CLLSTMLayer::configure(const ICLTensor *input, _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); - _mean_std_norm_input_gate.configure(input_gate_out); - 
_pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_input_gate.configure(compile_context, input_gate_out); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -224,14 +239,14 @@ void CLLSTMLayer::configure(const ICLTensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; if(_is_layer_norm_lstm) { @@ -239,27 +254,28 @@ void CLLSTMLayer::configure(const ICLTensor *input, _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); - _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(ArithmeticOperation::ADD, 
&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } - _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); + _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping if(cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); } // Configure block that calculates the output @@ -274,12 +290,12 @@ void CLLSTMLayer::configure(const ICLTensor *input, TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2); + _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2); _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); @@ -290,8 +306,8 @@ void CLLSTMLayer::configure(const ICLTensor *input, _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -308,15 +324,16 @@ void CLLSTMLayer::configure(const ICLTensor 
*input, _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); - _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_output_gate.configure(compile_context, output_gate_out); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -332,26 +349,26 @@ void CLLSTMLayer::configure(const ICLTensor *input, _output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_activation); - _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, 
activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); if(lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping if(projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); } } // Copy cell state and output - _copy_cell_state.configure(&_cell_state_out1, cell_state_out); - _copy_output.configure(output_state_out, output); + _copy_cell_state.configure(compile_context, &_cell_state_out1, cell_state_out); + _copy_output.configure(compile_context, output_state_out, output); // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; @@ -362,7 +379,7 @@ void CLLSTMLayer::configure(const ICLTensor *input, scratch_inputs.emplace_back(&_cell_state_out1); scratch_inputs.emplace_back(forget_gate_out); 
scratch_inputs.emplace_back(output_gate_out); - _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX); + _concat_scratch_buffer.configure(compile_context, scratch_inputs, scratch_buffer, Window::DimX); input_gate_out->allocator()->allocate(); _cell_state_out1.allocator()->allocate(); forget_gate_out->allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index e5f127825b..c57fcc9f21 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -61,6 +61,18 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, ICLTensor *cell_state_in, const ICLTensor *output_state_in, ICLTensor *cell_state_out, ICLTensor *output_state_out) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); +} + +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor 
*forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, @@ -107,18 +119,18 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, recurrent_weights_vector.emplace_back(recurrent_to_output_weights); _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); - _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); + _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); - _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); + _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); - _concat_weights.configure(weights_vector, &_weights, Window::DimX); - _transpose_weights.configure(&_weights, &_weights_transposed); + _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); + _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); // Input concatenation std::vector input_vector; @@ -127,7 +139,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, _memory_group.manage(&_input); 
_input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); - _concat_inputs.configure(input_vector, &_input, Window::DimX); + _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation std::vector bias_vector; @@ -137,7 +149,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, bias_vector.emplace_back(output_gate_bias); _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); - _concat_bias.configure(bias_vector, &_bias, Window::DimX); + _concat_bias.configure(compile_context, bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); @@ -146,7 +158,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, // Run gemmlowp _memory_group.manage(&_output_highp); _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32)); - _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp); + _gemmlowp.configure(compile_context, &_input, &_weights_transposed, nullptr, &_output_highp); _input.allocator()->allocate(); // Set the offset back @@ -162,7 +174,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); @@ -170,86 +182,86 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, if(batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { 
output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); 
_memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 
1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, 
ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); - _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); + _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); - _quantize.configure(&_output_state_out_f32, output_state_out); + _quantize.configure(compile_context, &_output_state_out_f32, output_state_out); _output_state_out_f32.allocator()->allocate(); } diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp index c2c04e6002..831f0cdcdf 100644 --- 
a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -47,6 +47,11 @@ CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT } void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value); +} + +void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON(nullptr == pyramid); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -67,18 +72,18 @@ void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTens _conv_pyr.init(pyramid_info); // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value); + _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value); _convf.resize(_num_levels); _subf.resize(_num_levels); for(unsigned int i = 0; i < _num_levels; ++i) { - _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); + _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); + _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); } - _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); + _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, 
ConvertPolicy::WRAP, 0); _gauss_pyr.allocate(); _conv_pyr.allocate(); diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp index 8d1cd98c91..ea6a3f9a98 100644 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp @@ -42,6 +42,11 @@ CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT } void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value); +} + +void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON(nullptr == pyramid); ARM_COMPUTE_ERROR_ON(input == output); @@ -67,17 +72,17 @@ void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *inpu const size_t last_level = num_levels - 1; - _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); + _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); // Scale levels n-1 to 1, and add levels n-2 to 0 for(size_t l = 0; l < last_level; ++l) { - _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); - _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); + _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, 
border_mode, constant_border_value); + _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); } // Convert level 0 from S16 to U8 - _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); + _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); _tmp_pyr.allocate(); } diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp index 3e99dde253..950be5030f 100644 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -128,6 +128,12 @@ Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor } void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info); +} + +void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? 
nullptr : biases->info(), output->info(), conv_info)); @@ -160,10 +166,10 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor _memory_group.manage(&_gemm_output); // Configure kernels - _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); - _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h)); + _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); + _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped); + _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); + _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); // Allocate intermediate tensors _input_im2col_reshaped.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp index 9ca9d167eb..a267952d4a 100644 --- a/src/runtime/CL/functions/CLMagnitude.cpp +++ b/src/runtime/CL/functions/CLMagnitude.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type); +} + +void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, nullptr, mag_type); + k->configure(compile_context, input1, input2, output, nullptr, mag_type); _kernel = std::move(k); } diff --git 
a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp index 8517b59e7a..e3ce704bfb 100644 --- a/src/runtime/CL/functions/CLMeanStdDev.cpp +++ b/src/runtime/CL/functions/CLMeanStdDev.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -65,6 +65,11 @@ Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev) } void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev); +} + +void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev) { // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev _data_type = input->info()->data_type(); @@ -74,14 +79,14 @@ void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev) _num_pixels = input->info()->dimension(0) * input->info()->dimension(1); _memory_group.manage(&_reduction_output_mean); - _reduction_operation_mean.configure(input, &_reduction_output_mean, 0, ReductionOperation::SUM); + _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM); _reduction_output_mean.allocator()->allocate(); _mean = mean; if(stddev != nullptr) { _memory_group.manage(&_reduction_output_stddev); - _reduction_operation_stddev.configure(input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE); + _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE); _reduction_output_stddev.allocator()->allocate(); _stddev = stddev; _run_stddev = true; @@ -96,8 +101,8 @@ void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev) _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); } - 
_mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); + _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); + _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); } } diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index 9b5e707665..3dbab76c72 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -30,9 +30,14 @@ namespace arm_compute { void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float epsilon) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); +} + +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, epsilon); + k->configure(compile_context, input, output, epsilon); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp index 7c5c422822..dc53240f79 100644 --- a/src/runtime/CL/functions/CLMedian3x3.cpp +++ b/src/runtime/CL/functions/CLMedian3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { 
auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp index 49dcbcb7df..15b28330b5 100644 --- a/src/runtime/CL/functions/CLMinMaxLocation.cpp +++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,13 @@ CLMinMaxLocation::CLMinMaxLocation() } void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count); +} + +void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, + uint32_t *min_count, + uint32_t *max_count) { ARM_COMPUTE_ERROR_ON(nullptr == min); ARM_COMPUTE_ERROR_ON(nullptr == max); @@ -55,8 +62,8 @@ void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CL _min_loc = min_loc; _max_loc = max_loc; - _min_max_kernel.configure(input, &_min_max_vals); - _min_max_loc_kernel.configure(input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); + _min_max_kernel.configure(compile_context, input, &_min_max_vals); + _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); } void 
CLMinMaxLocation::run() diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp index 843a756fea..96912a21cd 100644 --- a/src/runtime/CL/functions/CLNonLinearFilter.cpp +++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp @@ -32,9 +32,15 @@ using namespace arm_compute; void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value); +} + +void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, + const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp index 2f9c02dbb6..6d4a28db26 100644 --- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp +++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp @@ -31,17 +31,22 @@ using namespace arm_compute; void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode); +} + +void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); if(border_mode != BorderMode::UNDEFINED) { - _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT); + _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); } else { - _border_handler.configure(input, _kernel->border_size(), BorderMode::UNDEFINED); + _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); } } diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index 8489fab68b..f59a4ca959 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -38,14 +38,19 @@ CLNormalizationLayer::CLNormalizationLayer() } void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); +} + +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); // Configure normalization kernel - _norm_kernel.configure(input, output, norm_info); + _norm_kernel.configure(compile_context, input, output, norm_info); // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 33b993a003..b03de6475b 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -32,9 +32,14 @@ namespace arm_compute { void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); +} + +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, mean, std); + k->configure(compile_context, input, output, mean, std); _kernel = 
std::move(k); } diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp index 5e49937b5f..5f7c1704ee 100644 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ b/src/runtime/CL/functions/CLOpticalFlow.cpp @@ -61,6 +61,15 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension, + use_initial_estimate, border_mode, constant_border_value); +} + +void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, + const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, + BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); @@ -122,18 +131,18 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new _memory_group.manage(&_scharr_gy[i]); // Init Scharr kernel - _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); + _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); // Init Lucas-Kanade init kernel - _tracker_init_kernel[i].configure(old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, 
_num_levels, pyr_scale); + _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], + _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), window_dimension, i); // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel[i].configure(new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), termination, epsilon, num_iterations, window_dimension, i); // Allocate intermediate buffers @@ -142,7 +151,7 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new } // Finalize Lucas-Kanade - _tracker_finalize_kernel.configure(_new_points_internal.get(), new_points); + _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points); } void CLOpticalFlow::run() diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index ab4f53d960..6543ab922e 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -31,7 +31,7 @@ namespace arm_compute { namespace { -void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) +void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) { if(output->info()->dimension(0) > 1) { @@ -39,18 +39,23 @@ void 
configure_border_handler(CLFillBorderKernel &border_handler, BorderSize bor if(broadcasted_info->info()->dimension(0) == 1) { - border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE); + border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); } } } } // namespace void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); +} + +void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::PRELU, input, alpha, output); + k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input, alpha, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 8f36a69866..078bdbc51f 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -31,6 +31,11 @@ CLPadLayer::CLPadLayer() } void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); +} + +void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); @@ -41,12 +46,12 @@ void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingLis if(_perform_pad) { - _pad_kernel.configure(input, output, padding, constant_value, mode); + _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode); } else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(input, output); + _copy_kernel.configure(compile_context, input, output); } } Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index 6b88ef86ac..e6323ce504 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); +} + +void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, perm); + k->configure(compile_context, input, output, perm); _kernel = std::move(k); } diff --git 
a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp index 537dda02f4..b915104f38 100644 --- a/src/runtime/CL/functions/CLPhase.cpp +++ b/src/runtime/CL/functions/CLPhase.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type); +} + +void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type); + k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index b527922d2b..3c1a7de76d 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -33,9 +33,15 @@ namespace arm_compute { void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); +} + +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + 
k->configure(compile_context, input1, input2, output, scale, overflow_policy, rounding_policy, act_info); _kernel = std::move(k); if(output->info()->dimension(0) > 1) @@ -44,7 +50,7 @@ void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -56,9 +62,14 @@ Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen } void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, act_info); + k->configure(compile_context, input1, input2, output, act_info); _kernel = std::move(k); if(output->info()->dimension(0) > 1) @@ -67,7 +78,7 @@ void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *i if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 9c4fa4a2ba..e7735b00df 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -31,12 +31,17 @@ namespace arm_compute { void CLPoolingLayer::configure(ICLTensor *input, ICLTensor 
*output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); +} + +void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); // Configure pooling kernel auto k = arm_compute::support::cpp14::make_unique(); k->set_target(CLScheduler::get().target()); - k->configure(input, output, pool_info, indices); + k->configure(compile_context, input, output, pool_info, indices); _kernel = std::move(k); const DataType data_type = input->info()->data_type(); @@ -74,7 +79,7 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin default: ARM_COMPUTE_ERROR("Data layout not supported"); } - _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); // Tune kernels CLScheduler::get().tune_kernel_static(*_kernel); diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 4f6c969a92..d01b4c711b 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,6 +39,11 @@ CLPriorBoxLayer::CLPriorBoxLayer() } void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); +} + +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) { _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); @@ -48,7 +53,7 @@ void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2 } auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input1, input2, output, info, &_min, &_max, &_aspect_ratios); + k->configure(compile_context, input1, input2, output, info, &_min, &_max, &_aspect_ratios); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 4b994d47b2..88c5f77b9f 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -51,7 +51,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr memory_manager) _memory_group = MemoryGroup(std::move(memory_manager)); } -void CLQLSTMLayer::configure_mm(CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, const TensorInfo &mm_res_info, const 
TensorInfo &outstage_tensor_info) @@ -63,11 +63,11 @@ void CLQLSTMLayer::configure_mm(CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutp outstage_res->allocator()->init(outstage_tensor_info); // Configure matrix-multiplication - mm.configure(mm_input, mm_weights, nullptr, mm_res); + mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); + outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } @@ -78,6 +78,19 @@ void CLQLSTMLayer::configure(const ICLTensor *input, const ICLTensor *cell_state_in, const ICLTensor *output_state_in, ICLTensor *cell_state_out, ICLTensor *output_state_out, const LSTMParams &lstm_params) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, lstm_params); +} + +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + const ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out, + const LSTMParams &lstm_params) { ARM_COMPUTE_UNUSED(forget_gate_bias); ARM_COMPUTE_UNUSED(cell_bias); @@ -133,36 +146,36 @@ void 
CLQLSTMLayer::configure(const ICLTensor *input, _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction.configure(compile_context, 
input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); if(_projection_bias != nullptr) { - _projection_reduction.configure(_projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true)); - _projection_bias_add.configure(ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_reduction.configure(compile_context, _projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true)); + _projection_bias_add.configure(compile_context, ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE); } // Pre-transpose weights to be used in GEMM. 
- _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); if(!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, 
lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); } if(_has_projection) { - _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); + _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -175,31 +188,33 @@ void CLQLSTMLayer::configure(const ICLTensor *input, // Forget gate. const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_forget.configure(compile_context, 
ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); if(_has_peephole) { _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } @@ -209,30 +224,31 
@@ void CLQLSTMLayer::configure(const ICLTensor *input, const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(&_recurrent_to_forget_outstage_res, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, &_recurrent_to_forget_outstage_res, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _recurrent_to_forget_outstage_res.allocator()->allocate(); // Modulation gate. const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(ArithmeticOperation::ADD, &_input_to_cell_outstage_res, 
&_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(&_recurrent_to_cell_outstage_res, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, &_recurrent_to_cell_outstage_res, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); _recurrent_to_cell_outstage_res.allocator()->allocate(); // Input gate. @@ -242,75 +258,77 @@ void CLQLSTMLayer::configure(const ICLTensor *input, if(_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); - _input_gate_sub.configure(ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); + _input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); } else { const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, 
&_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, input, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, + ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); if(_has_peephole) { _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, 
DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); _accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } - _input_gate_tanh.configure(&_recurrent_to_input_outstage_res, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _input_gate_tanh.configure(compile_context, &_recurrent_to_input_outstage_res, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); _recurrent_to_input_outstage_res.allocator()->allocate(); } // Cell. 
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); if(_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); if(_has_peephole) @@ -320,31 +338,32 @@ void CLQLSTMLayer::configure(const ICLTensor *input, // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / 
lstm_params.output_intermediate_scale(); // quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accumulate_cell_to_output.configure(ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res, + ConvertPolicy::SATURATE); _mul_cell_to_output_res.allocator()->allocate(); } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(&_recurrent_to_output_outstage_res, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, &_recurrent_to_output_outstage_res, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _recurrent_to_output_outstage_res.allocator()->allocate(); // Hidden. 
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); - _hidden_outstage.configure(&_hidden_mul_res, nullptr, output_state_out, gemmlowp_info); + _hidden_outstage.configure(compile_context, &_hidden_mul_res, nullptr, output_state_out, gemmlowp_info); _hidden_mul_res.allocator()->allocate(); // Projection. 
@@ -358,12 +377,12 @@ void CLQLSTMLayer::configure(const ICLTensor *input, gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, output_state_out, &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, &_projection_outstage_res, projection_scale, mm_out_info, projection_outstage_info); - _accumulate_projection.configure(ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); int8_t quantized_projection_clip{ 0 }; @@ -374,7 +393,8 @@ void CLQLSTMLayer::configure(const ICLTensor *input, if(quantized_projection_clip > 0) { - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, + quantized_projection_clip)); _has_projection_clipping = true; } } diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index cc78ccede3..6239f279ea 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -29,9 +29,14 @@ namespace arm_compute { void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLQuantizationLayer::configure(const 
CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index e839a6ba21..57b8d70089 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -67,6 +67,13 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); +} + +void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); @@ -81,23 +88,23 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con // Manage intermediate buffers and configure _memory_group.manage(&_fully_connected_out); - _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); + _fully_connected_kernel.configure(compile_context, input, weights, bias, &_fully_connected_out); _memory_group.manage(&_gemm_output); - _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); + _gemm_state_f.configure(compile_context, hidden_state, recurrent_weights, 
nullptr, &_gemm_output, 1.f, 0.f); _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _memory_group.manage(&_add_output); - _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); + _add_kernel.configure(compile_context, ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); _fully_connected_out.allocator()->allocate(); _gemm_output.allocator()->allocate(); - _activation_kernel.configure(&_add_output, hidden_state, info); + _activation_kernel.configure(compile_context, &_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(hidden_state, output); + _copy_kernel.configure(compile_context, hidden_state, output); } void CLRNNLayer::run() diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index f0044e20a0..43b58ddb9b 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -37,10 +37,15 @@ Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *ro } void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); +} + +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) { // Configure ROI pooling kernel auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, rois, output, pool_info); + k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index 6780907155..bb54cfa2ca 100644 --- 
a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -31,9 +31,14 @@ using namespace arm_compute; void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); +} + +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) { // Configure ROI pooling kernel auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, rois, output, pool_info); + k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 5dedcc081c..b29b03d5b5 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -33,10 +33,15 @@ using namespace arm_compute; void CLRange::configure(ICLTensor *output, const float start, const float end, const float step) +{ + configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); +} + +void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { auto k = arm_compute::support::cpp14::make_unique(); k->set_target(CLScheduler::get().target()); - k->configure(output, start, end, step); + k->configure(compile_context, output, start, end, step); _kernel = std::move(k); // Tune kernels diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index 3b7a7f873a..3ddaa00d4b 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -92,6 +92,11 @@ CLReduceMean::CLReduceMean(std::shared_ptr memory_manager) { } void CLReduceMean::configure(ICLTensor *input, const Coordinates 
&reduction_axis, bool keep_dims, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); +} + +void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); @@ -118,13 +123,13 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis if(i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, output, axis_local[i], ReductionOperation::MEAN_SUM); } else { _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } @@ -147,7 +152,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis out_shape.remove_dimension(axis_local[i] - i); } auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], output); } } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 77168998f8..b659ecfaf6 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -190,6 +190,11 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor } void 
CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); +} + +void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _op = op; @@ -218,7 +223,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0); + _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0); } else { @@ -318,15 +323,15 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign ARM_COMPUTE_ERROR("Not supported"); } - _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op); - _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); + _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); - _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); + 
_border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); _results_vector[i - 1].allocator()->allocate(); } @@ -339,14 +344,14 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); + _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); _results_vector[last_stage - 1].allocator()->allocate(); } if(_is_reshape_required) { - _reshape_kernel.configure(&_results_vector.back(), output); + _reshape_kernel.configure(compile_context, &_results_vector.back(), output); _results_vector.back().allocator()->allocate(); } } diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp index 9e4529b1dc..af241ec299 100644 --- a/src/runtime/CL/functions/CLRemap.cpp +++ b/src/runtime/CL/functions/CLRemap.cpp @@ -36,6 +36,13 @@ using namespace arm_compute; void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value); +} + +void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const 
ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, + BorderMode border_mode, + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); @@ -44,7 +51,7 @@ void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTenso ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 547d0fc483..ea9331414c 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -35,9 +35,14 @@ using namespace arm_compute; void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); +} + +void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, stride); + k->configure(compile_context, input, output, stride); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 3325d2c234..13baedb3f9 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -31,9 +31,14 @@ using 
namespace arm_compute; void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLReshapeLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 3f7a2708bb..3c8bc15a54 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -30,9 +30,14 @@ namespace arm_compute { void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); +} + +void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, axis); + k->configure(compile_context, input, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 9ec8c44836..a9395bdc3d 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -34,11 +34,17 @@ using namespace arm_compute; void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners); +} + +void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode 
border_mode, PixelValue constant_border_value, + SamplingPolicy sampling_policy, bool use_padding, bool align_corners) { ARM_COMPUTE_UNUSED(use_padding); auto k = arm_compute::support::cpp14::make_unique(); k->set_target(CLScheduler::get().target()); - k->configure(input, output, policy, border_mode, sampling_policy, align_corners); + k->configure(compile_context, input, output, policy, border_mode, sampling_policy, align_corners); _kernel = std::move(k); // Tune kernels @@ -50,7 +56,7 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy { border_mode = BorderMode::CONSTANT; } - _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value); } Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp index df96ce83ba..faad5424a2 100644 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ b/src/runtime/CL/functions/CLScharr3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = 
std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index 90c368e9b8..7187010448 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,9 +32,14 @@ using namespace arm_compute; namespace arm_compute { void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); +} + +void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(c, x, y, output); + k->configure(compile_context, c, x, y, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index d8e8e7e140..e8cc0f5499 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -32,6 +32,11 @@ namespace arm_compute { void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); +} + +void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -39,7 +44,7 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin const int32_t slice_end_mask = 
arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); + k->configure(compile_context, input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp index e362ebf8ba..c3604f970f 100644 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ b/src/runtime/CL/functions/CLSobel3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp index 22fbef17eb..f8a33f3fb6 100644 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ b/src/runtime/CL/functions/CLSobel5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,6 +39,11 @@ CLSobel5x5::CLSobel5x5(std::shared_ptr memory_manager) } void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -53,8 +58,8 @@ void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -62,19 +67,19 @@ void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); 
_memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel5x5::run() diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp index 9b38f6928f..6d3c7f0d08 100644 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ b/src/runtime/CL/functions/CLSobel7x7.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,6 +39,11 @@ CLSobel7x7::CLSobel7x7(std::shared_ptr memory_manager) } void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -53,8 +58,8 @@ void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -62,19 +67,19 @@ void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); 
_memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel7x7::run() diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index e01d2c75ca..b0b2117cd9 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,6 +43,12 @@ CLSoftmaxLayerGeneric::CLSoftmaxLayerGeneric(std::shared_ptr void CLSoftmaxLayerGeneric::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis) +{ + configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, axis); +} + +template +void CLSoftmaxLayerGeneric::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis) { // Flatten the input const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); @@ -56,13 +62,13 @@ void CLSoftmaxLayerGeneric::configure_reshape_input_kernel(const ICLTens if(axis != 3) { auto reshape_kernel_ptr = support::cpp14::make_unique(); - reshape_kernel_ptr->configure(input, &_input_flattened); + reshape_kernel_ptr->configure(compile_context, input, &_input_flattened); _flatten_kernel_ptr = std::move(reshape_kernel_ptr); } else { auto flatten_kernel_ptr = support::cpp14::make_unique(); - flatten_kernel_ptr->configure(input, &_input_flattened); + flatten_kernel_ptr->configure(compile_context, input, &_input_flattened); _flatten_kernel_ptr = std::move(flatten_kernel_ptr); } @@ -73,6 +79,12 @@ void CLSoftmaxLayerGeneric::configure_reshape_input_kernel(const ICLTens template void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis); +} + +template +void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -123,7 +135,7 @@ void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor softmax_info.input_data_type = input_2D->info()->data_type(); // Configure kernels - 
_max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, softmax_info); + _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info); if(_needs_flattening) { @@ -131,10 +143,10 @@ void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor _memory_group.manage(&_output_flattened); // The normalization kernel stores the result in a flat output tensor - _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, softmax_info); + _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info); // Reshape the flat output into a the requested (4D) output - _reshape_kernel.configure(&_output_flattened, output); + _reshape_kernel.configure(compile_context, &_output_flattened, output); // Allocate the intermediate flat tensors _input_flattened.allocator()->allocate(); @@ -143,7 +155,7 @@ void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor else { // Softmax 2D case - _norm_kernel.configure(&_tmp, &_sum, output, softmax_info); + _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info); } // Allocate intermediate buffers @@ -203,7 +215,7 @@ Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const I } template -void CLSoftmaxLayerGeneric::run() +void CLSoftmaxLayerGeneric::run() { MemoryGroupResourceScope scope_mg(_memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index fa6e82efb0..021d31649d 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -38,27 +38,38 @@ CLSpaceToBatchLayer::CLSpaceToBatchLayer() } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); +} + +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape, paddings, output); + _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); +} + +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + 
_memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); } Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index f02a13f66d..a4ffefc189 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,12 @@ CLSpaceToDepthLayer::CLSpaceToDepthLayer() void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { - _space_to_depth_kernel.configure(input, output, block_shape); + configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); +} + +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +{ + _space_to_depth_kernel.configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 607445af4a..79c3fe5371 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -43,6 +43,11 @@ CLStackLayer::CLStackLayer() // NOLINT } void CLStackLayer::configure(const std::vector &input, int axis, ICLTensor *output) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); +} + +void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector &input, int axis, ICLTensor *output) { _num_inputs = input.size(); _stack_kernels.resize(_num_inputs); @@ -52,7 +57,7 @@ void CLStackLayer::configure(const std::vector &input, int axis, IC for(unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output); + _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index e217906870..454759664c 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -32,9 +32,16 @@ namespace arm_compute void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); +} + +void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, + const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, + int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp index 35f939f873..47e15d3c12 100644 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ b/src/runtime/CL/functions/CLTableLookup.cpp 
@@ -31,8 +31,13 @@ using namespace arm_compute; void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, lut, output); +} + +void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, lut, output); + k->configure(compile_context, input, lut, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp index a655783498..57c92724fa 100644 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ b/src/runtime/CL/functions/CLThreshold.cpp @@ -31,8 +31,14 @@ using namespace arm_compute; void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper); +} + +void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, + uint8_t upper) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, threshold, false_value, true_value, type, upper); + k->configure(compile_context, input, output, threshold, false_value, true_value, type, upper); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index e45d88d867..178d7af95e 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -29,9 +29,14 @@ namespace arm_compute { void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, 
multiples); +} + +void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, multiples); + k->configure(compile_context, input, output, multiples); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index aa912303db..f5121d06a5 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -31,9 +31,14 @@ using namespace arm_compute; void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLTranspose::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index eb1dd8cd44..032fb993d0 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -60,6 +60,11 @@ CLUnstack::CLUnstack() // NOLINT } void CLUnstack::configure(const ICLTensor *input, const std::vector &output_vector, int axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); +} + +void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector &output_vector, int axis) { std::vector outputs_vector_info(output_vector.size()); std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) @@ -83,7 +88,7 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp index 1dad3250a2..dd04686d60 100644 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,11 +43,17 @@ Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *ou void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, info, upsampling_policy); +} + +void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, + const Size2D &info, const InterpolationPolicy upsampling_policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _upsample.configure(input, _output, info, upsampling_policy); + _upsample.configure(compile_context, input, _output, info, upsampling_policy); } void CLUpsampleLayer::run() diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp index 08c22dba2b..ce2171b3d4 100644 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ b/src/runtime/CL/functions/CLWarpAffine.cpp @@ -32,9 +32,15 @@ using namespace arm_compute; void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); +} + +void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, + uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, matrix, policy); + k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, 
PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp index b5bc4faba3..06c06616d0 100644 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ b/src/runtime/CL/functions/CLWarpPerspective.cpp @@ -32,9 +32,15 @@ using namespace arm_compute; void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); +} + +void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, + uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, matrix, policy); + k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index a5db977371..132c3ee926 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,6 +97,13 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptrinfo()->data_layout(), DataLayoutDimension::WIDTH); @@ -129,17 +136,18 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we // Do not manage _input1 as it contains the weights // Configure input transform - _input_transform.configure(input, &_input0, winograd_info); + _input_transform.configure(compile_context, input, &_input0, winograd_info); // Configure filter transform - _filter_transform.configure(weights, &_input1, winograd_info); + _filter_transform.configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply - _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, GEMMLowpOutputStageInfo(), - (input->info()->data_type() == DataType::F16))); + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), + (input->info()->data_type() == DataType::F16))); // Configure output transform - _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info); + _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); // Allocate temporary tensors _input0.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp index 55eccf4765..ae400768fe 100644 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp @@ -31,11 +31,16 @@ using namespace arm_compute; void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info); +} + +void CLWinogradInputTransform::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, winograd_info); + k->configure(compile_context, input, output, winograd_info); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp index b2b84d9dc1..0c0c1065bc 100644 --- a/src/runtime/CL/functions/CLYOLOLayer.cpp +++ b/src/runtime/CL/functions/CLYOLOLayer.cpp @@ -30,9 +30,14 @@ using namespace arm_compute; void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info, num_classes); +} + +void CLYOLOLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, act_info, num_classes); + k->configure(compile_context, input, output, act_info, num_classes); _kernel = std::move(k); } diff --git a/tests/validation/fixtures/CropResizeFixture.h b/tests/validation/fixtures/CropResizeFixture.h index d83c4113f5..450c68e0e9 100644 --- a/tests/validation/fixtures/CropResizeFixture.h +++ b/tests/validation/fixtures/CropResizeFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -79,30 +79,27 @@ protected: TensorType boxes_ind = create_tensor(TensorShape(boxes_shape[1]), DataType::S32); TensorType dst = create_tensor(dst_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC); + boxes.allocator()->allocate(); + boxes_ind.allocator()->allocate(); + fill(AccessorType(boxes), 1, is_outside_bounds ? 0.0f - out_of_bounds_reach : 0.0f, is_outside_bounds ? 1.0f + out_of_bounds_reach : 1.0f); + fill(AccessorType(boxes_ind), 2, 0, static_cast(src_shape[3] - 1)); + // Create and configure function FunctionType crop; crop.configure(&src, &boxes, &boxes_ind, &dst, crop_size, method, extrapolation_value); ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(boxes.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(boxes_ind.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Allocate tensors src.allocator()->allocate(); - boxes.allocator()->allocate(); - boxes_ind.allocator()->allocate(); dst.allocator()->allocate(); ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!boxes.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!boxes_ind.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Fill tensors fill(AccessorType(src), 0); - fill(AccessorType(boxes), 1, is_outside_bounds ? 0.0f - out_of_bounds_reach : 0.0f, is_outside_bounds ? 1.0f + out_of_bounds_reach : 1.0f); - fill(AccessorType(boxes_ind), 2, 0, static_cast(src_shape[3] - 1)); // Compute function crop.run(); -- cgit v1.2.1