From bef7fa27b0d231a8649952f60808132d109b6345 Mon Sep 17 00:00:00 2001 From: Sang-Hoon Park Date: Wed, 21 Oct 2020 15:58:54 +0100 Subject: COMPMID-3639: (3RDPARTY_UPDATE) Move CL kernels to src Change-Id: I10d27db788e5086adae1841e3e2441cd9b76ef84 Signed-off-by: Sang-Hoon Park Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4310 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../runtime/CL/functions/CLAbsoluteDifference.h | 1 + arm_compute/runtime/CL/functions/CLAccumulate.h | 1 + .../runtime/CL/functions/CLActivationLayer.h | 2 + .../runtime/CL/functions/CLArgMinMaxLayer.h | 26 ++- .../CL/functions/CLBatchNormalizationLayer.h | 20 ++- .../runtime/CL/functions/CLBatchToSpaceLayer.h | 20 ++- arm_compute/runtime/CL/functions/CLBitwiseAnd.h | 1 + arm_compute/runtime/CL/functions/CLBitwiseNot.h | 1 + arm_compute/runtime/CL/functions/CLBitwiseOr.h | 1 + arm_compute/runtime/CL/functions/CLBitwiseXor.h | 1 + .../runtime/CL/functions/CLBoundingBoxTransform.h | 6 +- arm_compute/runtime/CL/functions/CLBox3x3.h | 1 + arm_compute/runtime/CL/functions/CLCannyEdge.h | 37 +++-- arm_compute/runtime/CL/functions/CLCast.h | 2 + .../runtime/CL/functions/CLChannelCombine.h | 1 + .../runtime/CL/functions/CLChannelExtract.h | 1 + .../runtime/CL/functions/CLChannelShuffleLayer.h | 3 + arm_compute/runtime/CL/functions/CLColorConvert.h | 1 + arm_compute/runtime/CL/functions/CLComparison.h | 2 + .../runtime/CL/functions/CLComputeAllAnchors.h | 5 +- .../runtime/CL/functions/CLConcatenateLayer.h | 3 +- .../CL/functions/CLConvertFullyConnectedWeights.h | 5 +- arm_compute/runtime/CL/functions/CLConvolution.h | 33 +++- .../runtime/CL/functions/CLConvolutionLayer.h | 10 ++ arm_compute/runtime/CL/functions/CLCopy.h | 2 + arm_compute/runtime/CL/functions/CLCropResize.h | 10 +- .../CL/functions/CLDeconvolutionLayerUpsample.h | 19 ++- .../runtime/CL/functions/CLDepthConvertLayer.h | 2 + .../runtime/CL/functions/CLDepthToSpaceLayer.h | 2 + 
.../CL/functions/CLDepthwiseConvolutionLayer.h | 59 +++---- .../runtime/CL/functions/CLDequantizationLayer.h | 2 + arm_compute/runtime/CL/functions/CLDerivative.h | 1 + arm_compute/runtime/CL/functions/CLDilate.h | 1 + .../CL/functions/CLDirectConvolutionLayer.h | 18 +- .../runtime/CL/functions/CLElementWiseUnaryLayer.h | 2 + .../runtime/CL/functions/CLElementwiseOperations.h | 2 + .../runtime/CL/functions/CLEqualizeHistogram.h | 29 ++-- arm_compute/runtime/CL/functions/CLErode.h | 1 + arm_compute/runtime/CL/functions/CLFFT1D.h | 28 ++-- arm_compute/runtime/CL/functions/CLFFT2D.h | 6 + arm_compute/runtime/CL/functions/CLFastCorners.h | 29 ++-- arm_compute/runtime/CL/functions/CLFill.h | 1 + arm_compute/runtime/CL/functions/CLFillBorder.h | 1 + arm_compute/runtime/CL/functions/CLFlattenLayer.h | 2 + arm_compute/runtime/CL/functions/CLFloor.h | 2 + .../runtime/CL/functions/CLFullyConnectedLayer.h | 1 - .../CL/functions/CLFuseBatchNormalization.h | 10 +- arm_compute/runtime/CL/functions/CLGEMM.h | 102 ++++++------ .../runtime/CL/functions/CLGEMMConvolutionLayer.h | 25 ++- .../CL/functions/CLGEMMDeconvolutionLayer.h | 22 +-- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 35 ++-- .../runtime/CL/functions/CLGEMMLowpOutputStage.h | 7 + arm_compute/runtime/CL/functions/CLGather.h | 3 + arm_compute/runtime/CL/functions/CLGaussian3x3.h | 1 + arm_compute/runtime/CL/functions/CLGaussian5x5.h | 26 ++- .../runtime/CL/functions/CLGaussianPyramid.h | 28 ++-- .../CL/functions/CLGenerateProposalsLayer.h | 39 +++-- arm_compute/runtime/CL/functions/CLHOGDescriptor.h | 23 ++- arm_compute/runtime/CL/functions/CLHOGDetector.h | 19 ++- arm_compute/runtime/CL/functions/CLHOGGradient.h | 16 +- .../runtime/CL/functions/CLHOGMultiDetection.h | 36 ++-- arm_compute/runtime/CL/functions/CLHarrisCorners.h | 40 ++--- arm_compute/runtime/CL/functions/CLHistogram.h | 2 +- .../CL/functions/CLInstanceNormalizationLayer.h | 3 + arm_compute/runtime/CL/functions/CLIntegralImage.h | 16 +- 
.../runtime/CL/functions/CLL2NormalizeLayer.h | 22 ++- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 184 +++++++++++---------- .../runtime/CL/functions/CLLocallyConnectedLayer.h | 30 ++-- arm_compute/runtime/CL/functions/CLMagnitude.h | 1 + .../runtime/CL/functions/CLMaxUnpoolingLayer.h | 21 ++- arm_compute/runtime/CL/functions/CLMeanStdDev.h | 39 +++-- .../CL/functions/CLMeanStdDevNormalizationLayer.h | 2 + arm_compute/runtime/CL/functions/CLMedian3x3.h | 1 + .../runtime/CL/functions/CLMinMaxLocation.h | 28 ++-- .../runtime/CL/functions/CLNonLinearFilter.h | 1 + .../CL/functions/CLNonMaximaSuppression3x3.h | 1 + .../runtime/CL/functions/CLNormalizationLayer.h | 27 ++- .../CL/functions/CLNormalizePlanarYUVLayer.h | 2 + arm_compute/runtime/CL/functions/CLOpticalFlow.h | 41 +++-- arm_compute/runtime/CL/functions/CLPReluLayer.h | 3 +- arm_compute/runtime/CL/functions/CLPadLayer.h | 14 +- arm_compute/runtime/CL/functions/CLPermute.h | 2 + arm_compute/runtime/CL/functions/CLPhase.h | 1 + .../CL/functions/CLPixelWiseMultiplication.h | 8 +- arm_compute/runtime/CL/functions/CLPoolingLayer.h | 2 + arm_compute/runtime/CL/functions/CLPriorBoxLayer.h | 5 +- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 167 +++++++++---------- .../runtime/CL/functions/CLQuantizationLayer.h | 3 + arm_compute/runtime/CL/functions/CLRNNLayer.h | 30 ++-- arm_compute/runtime/CL/functions/CLROIAlignLayer.h | 4 +- .../runtime/CL/functions/CLROIPoolingLayer.h | 6 +- arm_compute/runtime/CL/functions/CLRange.h | 2 + .../runtime/CL/functions/CLReductionOperation.h | 33 ++-- arm_compute/runtime/CL/functions/CLRemap.h | 1 + arm_compute/runtime/CL/functions/CLReorgLayer.h | 2 + arm_compute/runtime/CL/functions/CLReshapeLayer.h | 2 + arm_compute/runtime/CL/functions/CLReverse.h | 3 + arm_compute/runtime/CL/functions/CLScale.h | 2 + arm_compute/runtime/CL/functions/CLScharr3x3.h | 1 + arm_compute/runtime/CL/functions/CLSelect.h | 5 +- arm_compute/runtime/CL/functions/CLSlice.h | 2 + 
arm_compute/runtime/CL/functions/CLSobel3x3.h | 9 + arm_compute/runtime/CL/functions/CLSobel5x5.h | 24 ++- arm_compute/runtime/CL/functions/CLSobel7x7.h | 24 ++- arm_compute/runtime/CL/functions/CLSoftmaxLayer.h | 37 +++-- .../runtime/CL/functions/CLSpaceToBatchLayer.h | 19 ++- .../runtime/CL/functions/CLSpaceToDepthLayer.h | 19 ++- arm_compute/runtime/CL/functions/CLStackLayer.h | 21 ++- arm_compute/runtime/CL/functions/CLTableLookup.h | 1 + arm_compute/runtime/CL/functions/CLThreshold.h | 1 + arm_compute/runtime/CL/functions/CLTile.h | 5 +- arm_compute/runtime/CL/functions/CLTranspose.h | 3 + arm_compute/runtime/CL/functions/CLUpsampleLayer.h | 12 +- arm_compute/runtime/CL/functions/CLWarpAffine.h | 1 + .../runtime/CL/functions/CLWarpPerspective.h | 1 + .../CL/functions/CLWinogradConvolutionLayer.h | 28 ++-- .../CL/functions/CLWinogradInputTransform.h | 2 + arm_compute/runtime/CL/functions/CLYOLOLayer.h | 5 +- 118 files changed, 1106 insertions(+), 661 deletions(-) (limited to 'arm_compute/runtime/CL/functions') diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h index b0f1948beb..f2831e2a99 100644 --- a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h +++ b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLAbsoluteDifferenceKernel diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h index 9dbf13b873..20d3476d2e 100644 --- a/arm_compute/runtime/CL/functions/CLAccumulate.h +++ b/arm_compute/runtime/CL/functions/CLAccumulate.h @@ -30,6 +30,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLAccumulateKernel */ diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index 
632487c78d..dc2cb62b71 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLActivationLayerKernel * diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h index dc0c37e860..c254284cd7 100644 --- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLARGMINMAXLAYER_H #define ARM_COMPUTE_CLARGMINMAXLAYER_H -#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" @@ -36,6 +35,7 @@ namespace arm_compute { class ITensorInfo; class ICLTensor; +class CLArgMinMaxLayerKernel; /** Function to calculate the index of the minimum or maximum values in a * tensor based on an axis. @@ -53,6 +53,16 @@ public: * @param[in] memory_manager (Optional) Memory manager. */ CLArgMinMaxLayer(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLArgMinMaxLayer(const CLArgMinMaxLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLArgMinMaxLayer &operator=(const CLArgMinMaxLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLArgMinMaxLayer(CLArgMinMaxLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLArgMinMaxLayer &operator=(CLArgMinMaxLayer &&) = delete; + /** Default destructor */ + ~CLArgMinMaxLayer(); /** Set the input and output tensors. * * @param[in] input Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32. 
@@ -85,13 +95,13 @@ public: void run() override; private: - MemoryGroup _memory_group; - std::vector _results_vector; - CLTensor _not_reshaped_output; - std::vector _reduction_kernels_vector; - CLReshapeLayer _reshape; - unsigned int _num_of_stages; - unsigned int _reduction_axis; + MemoryGroup _memory_group; + std::vector _results_vector; + CLTensor _not_reshaped_output; + std::vector> _reduction_kernels_vector; + CLReshapeLayer _reshape; + unsigned int _num_of_stages; + unsigned int _reduction_axis; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLARGMINMAXLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h index c22991da7c..c8acf9fc6b 100644 --- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -26,12 +26,16 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" #include "arm_compute/core/Types.h" +#include + namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; +class CLBatchNormalizationLayerKernel; /** Basic function to run @ref CLNormalizationLayerKernel and simulate a batch normalization layer. 
* @@ -44,6 +48,16 @@ class CLBatchNormalizationLayer : public IFunction public: /** Default constructor */ CLBatchNormalizationLayer(); + /** Prevent instances of this class from being copied */ + CLBatchNormalizationLayer(const CLBatchNormalizationLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLBatchNormalizationLayer &operator=(const CLBatchNormalizationLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLBatchNormalizationLayer(CLBatchNormalizationLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLBatchNormalizationLayer &operator=(CLBatchNormalizationLayer &&) = delete; + /** Default destructor */ + ~CLBatchNormalizationLayer(); /** Set the input and output tensors. * * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place @@ -104,7 +118,7 @@ public: void run() override; private: - CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */ + std::unique_ptr _norm_kernel; /**< BatchNormalization layer kernel to run */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h index ba57921cc2..bdb58531d0 100644 --- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h @@ -26,11 +26,15 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h" #include "arm_compute/core/Types.h" +#include + namespace arm_compute { +class CLCompileContext; +class ITensorInfo; +class CLBatchToSpaceLayerKernel; class ICLTensor; /** Basic function to run @ref CLBatchToSpaceLayerKernel. 
*/ @@ -39,6 +43,16 @@ class CLBatchToSpaceLayer : public IFunction public: /** Default constructor */ CLBatchToSpaceLayer(); + /** Prevent instances of this class from being copied */ + CLBatchToSpaceLayer(const CLBatchToSpaceLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLBatchToSpaceLayer &operator=(const CLBatchToSpaceLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLBatchToSpaceLayer(CLBatchToSpaceLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLBatchToSpaceLayer &operator=(CLBatchToSpaceLayer &&) = delete; + /** Default destructor */ + ~CLBatchToSpaceLayer(); /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -95,7 +109,7 @@ public: void run() override; private: - CLBatchToSpaceLayerKernel _batch_to_space_kernel; /**< CLBatchToSpaceLayerKernel to run */ + std::unique_ptr _batch_to_space_kernel; /**< CLBatchToSpaceLayerKernel to run */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBATCHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h index 3c28938807..bf5993f4b0 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseAndKernel. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h index 4c21d5647f..1d8531a176 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseNotKernel. 
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h index 8a481737e3..7876cbf196 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseOrKernel. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h index 6928e59d38..4f054062cd 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseXorKernel. diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h index 5e4e89071b..d6409106da 100644 --- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h +++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h @@ -24,12 +24,16 @@ #ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSOFORM_H #define ARM_COMPUTE_CLBOUNDINGBOXTRANSOFORM_H -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; +class CLBoundingBoxTransformKernel; +class BoundingBoxTransformInfo; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLBoundingBoxTransformKernel. * diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h index 2d2aa4705c..cff780614c 100644 --- a/arm_compute/runtime/CL/functions/CLBox3x3.h +++ b/arm_compute/runtime/CL/functions/CLBox3x3.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute box filter 3x3. 
This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h index f9d9f8f66a..9e41c31728 100644 --- a/arm_compute/runtime/CL/functions/CLCannyEdge.h +++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h @@ -26,8 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -36,6 +34,11 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLGradientKernel; +class CLEdgeNonMaxSuppressionKernel; +class CLEdgeTraceKernel; class ICLTensor; /** Basic function to execute canny edge on OpenCL. This function calls the following OpenCL kernels and functions: @@ -56,6 +59,8 @@ public: CLCannyEdge(const CLCannyEdge &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ CLCannyEdge &operator=(const CLCannyEdge &) = delete; + /** Default destructor */ + ~CLCannyEdge(); /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode. * * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED) @@ -88,20 +93,20 @@ public: virtual void run() override; private: - MemoryGroup _memory_group; /**< Function's memory group */ - std::unique_ptr _sobel; /**< Pointer to Sobel kernel. */ - CLGradientKernel _gradient; /**< Gradient kernel. */ - CLFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ - CLEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel. */ - CLEdgeTraceKernel _edge_trace; /**< Edge tracing kernel. */ - CLImage _gx; /**< Source tensor - Gx component. 
*/ - CLImage _gy; /**< Source tensor - Gy component. */ - CLImage _mag; /**< Source tensor - Magnitude. */ - CLImage _phase; /**< Source tensor - Phase. */ - CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */ - CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */ - ICLTensor *_output; /**< Output tensor provided by the user. */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr _sobel; /**< Pointer to Sobel kernel. */ + std::unique_ptr _gradient; /**< Gradient kernel. */ + std::unique_ptr _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ + std::unique_ptr _non_max_suppr; /**< Non-Maxima suppression kernel. */ + std::unique_ptr _edge_trace; /**< Edge tracing kernel. */ + CLImage _gx; /**< Source tensor - Gx component. */ + CLImage _gy; /**< Source tensor - Gy component. */ + CLImage _mag; /**< Source tensor - Magnitude. */ + CLImage _phase; /**< Source tensor - Phase. */ + CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */ + CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */ + ICLTensor *_output; /**< Output tensor provided by the user. */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLCANNYEDGE_H */ diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h index 592368d135..bd333d4e72 100644 --- a/arm_compute/runtime/CL/functions/CLCast.h +++ b/arm_compute/runtime/CL/functions/CLCast.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLDepthConvertLayerKernel. 
*/ class CLCast : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h index 4e3d10cc10..5927662fc2 100644 --- a/arm_compute/runtime/CL/functions/CLChannelCombine.h +++ b/arm_compute/runtime/CL/functions/CLChannelCombine.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLMultiImage; class ICLTensor; using ICLImage = ICLTensor; diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h index cf042b4519..9ce9bcdd8a 100644 --- a/arm_compute/runtime/CL/functions/CLChannelExtract.h +++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h @@ -29,6 +29,7 @@ namespace arm_compute { +class CLCompileContext; class ICLMultiImage; class ICLTensor; using ICLImage = ICLTensor; diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h index e0bb3d01c9..54cf59f59a 100644 --- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h +++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYER_H #define ARM_COMPUTE_CLCHANNELSHUFFLELAYER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLChannelShuffleLayerKernel * diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h index e4017c2686..47bcabfb63 100644 --- a/arm_compute/runtime/CL/functions/CLColorConvert.h +++ b/arm_compute/runtime/CL/functions/CLColorConvert.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLMultiImage; class ICLTensor; using ICLImage = ICLTensor; diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h 
index c6d61e45f2..8cc3e96ec5 100644 --- a/arm_compute/runtime/CL/functions/CLComparison.h +++ b/arm_compute/runtime/CL/functions/CLComparison.h @@ -30,7 +30,9 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLComparisonKernel */ class CLComparison : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h index a2f1a4eb66..d6a2ab423d 100644 --- a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h +++ b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h @@ -24,12 +24,15 @@ #ifndef ARM_COMPUTE_CLCOMPUTEALLANCHORS_H #define ARM_COMPUTE_CLCOMPUTEALLANCHORS_H -#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; +class ComputeAnchorsInfo; /** Basic function to run @ref CLComputeAllAnchorsKernel. 
* diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h index f535c8ea97..5e7003a112 100644 --- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h +++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h @@ -27,7 +27,6 @@ #include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" #include @@ -36,7 +35,9 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; +class ICLKernel; class ITensorInfo; class Status; diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h index 9298be2e53..75a3d3213e 100644 --- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h @@ -24,14 +24,17 @@ #ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTS_H #define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTS_H -#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/ITransformWeights.h" namespace arm_compute { +class CLCompileContext; +class CLConvertFullyConnectedWeightsKernel; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLConvertFullyConnectedWeightsKernel. 
*/ class CLConvertFullyConnectedWeights : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h index c06ad0d969..4a1631a702 100644 --- a/arm_compute/runtime/CL/functions/CLConvolution.h +++ b/arm_compute/runtime/CL/functions/CLConvolution.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLCONVOLUTION_H #define ARM_COMPUTE_CLCONVOLUTION_H -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" @@ -38,6 +36,13 @@ namespace arm_compute { +template +class CLConvolutionKernel; +template +class CLSeparableConvolutionHorKernel; +template +class CLSeparableConvolutionVertKernel; +class CLFillBorderKernel; class ICLTensor; /** Basic function to execute convolution of size 3x3. This function calls the following OpenCL kernels: @@ -85,6 +90,16 @@ class CLConvolutionSquare : public IFunction public: /** Default constructor */ CLConvolutionSquare(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConvolutionSquare(const CLConvolutionSquare &) = delete; + /** Default move constructor */ + CLConvolutionSquare(CLConvolutionSquare &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConvolutionSquare &operator=(const CLConvolutionSquare &) = delete; + /** Default move assignment operator */ + CLConvolutionSquare &operator=(CLConvolutionSquare &&) = default; + /** Default destructor */ + ~CLConvolutionSquare(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source tensor. Data types supported: U8. 
(Written to only for @p border_mode != UNDEFINED) @@ -111,13 +126,13 @@ public: void run() override; private: - MemoryGroup _memory_group; /**< Function's memory group */ - CLTensor _tmp; /**< temporary buffer for output of horizontal pass */ - bool _is_separable; /**< true if the convolution can be separated */ - CLSeparableConvolutionHorKernel _kernel_hor; /**< kernel for horizontal pass of separated convolution */ - CLSeparableConvolutionVertKernel _kernel_vert; /**< kernel for vertical pass of separated convolution */ - CLConvolutionKernel _kernel; /**< kernel for non-separated convolution **/ - CLFillBorderKernel _border_handler; /**< kernel for border handling */ + MemoryGroup _memory_group; /**< Function's memory group */ + CLTensor _tmp; /**< temporary buffer for output of horizontal pass */ + bool _is_separable; /**< true if the convolution can be separated */ + std::unique_ptr> _kernel_hor; /**< kernel for horizontal pass of separated convolution */ + std::unique_ptr> _kernel_vert; /**< kernel for vertical pass of separated convolution */ + std::unique_ptr> _kernel; /**< kernel for non-separated convolution **/ + std::unique_ptr _border_handler; /**< kernel for border handling */ }; /** Basic function to run 5x5 convolution. 
*/ diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index ac36523682..d1de721193 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -73,6 +73,16 @@ class CLConvolutionLayer : public IFunction public: /** Default constructor */ CLConvolutionLayer(std::shared_ptr memory_manager = nullptr); + /** Default Destructor */ + ~CLConvolutionLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConvolutionLayer(const CLConvolutionLayer &) = delete; + /** Default move constructor */ + CLConvolutionLayer(CLConvolutionLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConvolutionLayer &operator=(const CLConvolutionLayer &) = delete; + /** Default move assignment operator */ + CLConvolutionLayer &operator=(CLConvolutionLayer &&) = default; /** Set the input and output tensors. * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h index c20d75eea8..f1a091df84 100644 --- a/arm_compute/runtime/CL/functions/CLCopy.h +++ b/arm_compute/runtime/CL/functions/CLCopy.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; class CLCopy : public ICLSimpleFunction { diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h index e940928b90..e781cfe61f 100644 --- a/arm_compute/runtime/CL/functions/CLCropResize.h +++ b/arm_compute/runtime/CL/functions/CLCropResize.h @@ -25,9 +25,7 @@ #define ARM_COMPUTE_CL_CROP_RESIZE_H #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLCropKernel.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" + #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLScale.h" @@ -37,7 +35,11 @@ namespace arm_compute { // Forward Declarations +class CLCompileContext; +class CLCopyKernel; +class CLCropKernel; class ITensor; +class ITensorInfo; /** Function to perform cropping and resizing */ class CLCropResize : public IFunction @@ -54,7 +56,7 @@ public: /** Allow instances of this class to be moved */ CLCropResize &operator=(CLCropResize &&) = default; /** Default destructor */ - virtual ~CLCropResize() = default; + ~CLCropResize(); /** Configure kernel * diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h index 19a44f7b93..3ebc858d32 100644 --- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h +++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h @@ -24,17 +24,20 @@ #ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H #define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H 
-#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IFunction.h" + +#include namespace arm_compute { // Forward declarations +class CLDeconvolutionLayerUpsampleKernel; +class CLCompileContext; +class CLMemsetKernel; class ICLTensor; +class ITensorInfo; /** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions: * @@ -55,7 +58,7 @@ public: /** Allow instances of this class to be moved */ CLDeconvolutionLayerUpsample &operator=(CLDeconvolutionLayerUpsample &&) = default; /** Default destructor */ - virtual ~CLDeconvolutionLayerUpsample() = default; + ~CLDeconvolutionLayerUpsample(); /** Initialize the function's source, destination, interpolation type and border_mode. * @@ -86,9 +89,9 @@ public: void run() override; private: - CLDeconvolutionLayerUpsampleKernel _upsample; - CLMemsetKernel _memset; - ICLTensor *_output; + std::unique_ptr _upsample; + std::unique_ptr _memset; + ICLTensor *_output; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H */ diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h index d125584c97..b0f297aec5 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLDepthConvertLayerKernel. 
*/ class CLDepthConvertLayer : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h index 5e197cb9b8..a0aa288dbf 100644 --- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLDepthToSpaceLayerKernel. */ class CLDepthToSpaceLayer : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index 558c4540fa..8e594bc09f 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -24,12 +24,6 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLPermute.h" @@ -38,6 +32,11 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLDepthwiseConvolutionLayerNativeKernel; +class CLDepthwiseConvolutionLayerReshapeWeightsKernel; +class ICLDepthwiseConvolutionLayer3x3Kernel; class ICLTensor; /** Function to execute a depthwise convolution @@ -55,6 +54,8 @@ public: CLDepthwiseConvolutionLayer &operator=(const 
CLDepthwiseConvolutionLayer &) = delete; /** Default move assignment operator */ CLDepthwiseConvolutionLayer &operator=(CLDepthwiseConvolutionLayer &&) = default; + /** Default destructor */ + ~CLDepthwiseConvolutionLayer(); /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW @@ -211,25 +212,25 @@ private: }; private: - MemoryGroup _memory_group; - std::unique_ptr _kernel; - CLFillBorderKernel _border_handler; - CLPermute _permute_input_to_nchw; - CLPermute _permute_weights_to_nchw; - CLPermute _permute_output_to_nhwc; - CLDepthwiseConvolutionLayerReshapeWeightsKernel _reshape_weights; - CLTensor _permuted_input; - CLTensor _permuted_weights; - CLTensor _permuted_output; - CLTensor _output_multipliers; - CLTensor _output_shifts; - const ITensor *_original_weights; - const ITensor *_input; - const ITensor *_output; - bool _needs_permute; - bool _needs_weights_reshape; - bool _is_prepared; - bool _is_quantized; + MemoryGroup _memory_group; + std::unique_ptr _kernel; + std::unique_ptr _border_handler; + CLPermute _permute_input_to_nchw; + CLPermute _permute_weights_to_nchw; + CLPermute _permute_output_to_nhwc; + std::unique_ptr _reshape_weights; + CLTensor _permuted_input; + CLTensor _permuted_weights; + CLTensor _permuted_output; + CLTensor _output_multipliers; + CLTensor _output_shifts; + const ITensor *_original_weights; + const ITensor *_input; + const ITensor *_output; + bool _needs_permute; + bool _needs_weights_reshape; + bool _is_prepared; + bool _is_quantized; }; /** Basic function to execute a generic depthwise convolution. 
This function calls the following OpenCL kernels: @@ -313,10 +314,10 @@ private: private: MemoryGroup _memory_group; - CLDepthwiseConvolutionLayerNativeKernel _dwc_native_kernel; - CLPermute _permute_input_to_nhwc; - CLPermute _permute_weights_to_nhwc; - CLPermute _permute_output_to_nchw; + std::unique_ptr _dwc_native_kernel; + CLPermute _permute_input_to_nhwc; + CLPermute _permute_weights_to_nhwc; + CLPermute _permute_output_to_nchw; CLTensor _permuted_input; CLTensor _permuted_weights; diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h index 88ed915421..b2cf3356f4 100644 --- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h @@ -31,7 +31,9 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLDequantizationLayerKernel that dequantizes an input tensor */ class CLDequantizationLayer : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h index 1aba6a9f6c..4a91d5d50b 100644 --- a/arm_compute/runtime/CL/functions/CLDerivative.h +++ b/arm_compute/runtime/CL/functions/CLDerivative.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute first order derivative operator. This function calls the following CL kernels: diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h index adb9cf4e6c..bf72cd3b26 100644 --- a/arm_compute/runtime/CL/functions/CLDilate.h +++ b/arm_compute/runtime/CL/functions/CLDilate.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute dilate. 
This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h index 8107fa24f3..0afc9d3f38 100644 --- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H #define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/IFunction.h" @@ -34,7 +32,11 @@ namespace arm_compute { +class CLCompileContext; +class CLDirectConvolutionLayerKernel; +class CLFillBorderKernel; class ICLTensor; +class ITensorInfo; /** Basic function to execute direct convolution function: */ @@ -43,6 +45,12 @@ class CLDirectConvolutionLayer : public IFunction public: /** Default constructor */ CLDirectConvolutionLayer(); + /** Prevent instances of this class from being copied */ + CLDirectConvolutionLayer(const CLDirectConvolutionLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLDirectConvolutionLayer &operator=(const CLDirectConvolutionLayer &) = delete; + /** Default destructor */ + ~CLDirectConvolutionLayer(); /** Set the input and output tensors. * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], @@ -95,9 +103,9 @@ public: void run() override; private: - CLDirectConvolutionLayerKernel _direct_conv_kernel; - CLFillBorderKernel _input_border_handler; - CLActivationLayer _activationlayer_function; + std::unique_ptr _direct_conv_kernel; + std::unique_ptr _input_border_handler; + CLActivationLayer _activationlayer_function; bool _is_activationlayer_enabled; }; diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h index 5208bfe404..72b5b7dee8 100644 --- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h +++ b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to perform inverse square root on an input tensor. */ class CLRsqrtLayer : public IFunction diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h index 31d4f2e745..55c5fb3455 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h @@ -30,6 +30,8 @@ namespace arm_compute { class ICLTensor; +class CLCompileContext; +class ITensorInfo; namespace experimental { diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h index 883f330b33..17352d1a9b 100644 --- a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h +++ b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h @@ -24,16 +24,19 @@ #ifndef ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H #define ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H -#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" #include "arm_compute/runtime/CL/CLDistribution1D.h" #include "arm_compute/runtime/CL/CLLut.h" #include 
"arm_compute/runtime/IFunction.h" #include +#include namespace arm_compute { +class CLCompileContext; +class CLHistogramKernel; +class CLHistogramBorderKernel; +class CLTableLookupKernel; class ICLTensor; using ICLImage = ICLTensor; @@ -48,6 +51,12 @@ class CLEqualizeHistogram : public IFunction public: /** Default Constructor. */ CLEqualizeHistogram(); + /** Prevent instances of this class from being copied */ + CLEqualizeHistogram(const CLEqualizeHistogram &) = delete; + /** Prevent instances of this class from being copied */ + CLEqualizeHistogram &operator=(const CLEqualizeHistogram &) = delete; + /** Default destructor */ + ~CLEqualizeHistogram(); /** Initialise the kernel's inputs. * * @param[in] input Input image. Data types supported: U8. @@ -66,14 +75,14 @@ public: void run() override; private: - CLHistogramKernel _histogram_kernel; /**< Kernel that calculates the histogram of input. */ - CLHistogramBorderKernel _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */ - CLTableLookupKernel _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */ - CLDistribution1D _hist; /**< Distribution that holds the histogram of the input image. */ - CLDistribution1D _cum_dist; /**< Distribution that holds the cummulative distribution of the input histogram. */ - CLLut _cd_lut; /**< Holds the equalization lookuptable. */ - static const uint32_t max_range = 256; /**< Histogram range of the internal histograms. */ - static const uint32_t nr_bins = 256; /**< Histogram bins of the internal histograms. */ + std::unique_ptr _histogram_kernel; /**< Kernel that calculates the histogram of input. */ + std::unique_ptr _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */ + std::unique_ptr _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */ + CLDistribution1D _hist; /**< Distribution that holds the histogram of the input image. 
*/ + CLDistribution1D _cum_dist; /**< Distribution that holds the cummulative distribution of the input histogram. */ + CLLut _cd_lut; /**< Holds the equalization lookuptable. */ + static const uint32_t max_range = 256; /**< Histogram range of the internal histograms. */ + static const uint32_t nr_bins = 256; /**< Histogram bins of the internal histograms. */ }; } #endif /*ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H */ diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h index f8f1c72bc0..9d799bc91e 100644 --- a/arm_compute/runtime/CL/functions/CLErode.h +++ b/arm_compute/runtime/CL/functions/CLErode.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute erode. This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h index a6a35ab320..31a2cc6b06 100644 --- a/arm_compute/runtime/CL/functions/CLFFT1D.h +++ b/arm_compute/runtime/CL/functions/CLFFT1D.h @@ -26,9 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h" -#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h" -#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -36,6 +33,9 @@ namespace arm_compute { // Forward declaration +class CLFFTDigitReverseKernel; +class CLFFTRadixStageKernel; +class CLFFTScaleKernel; class ICLTensor; /** Basic function to execute one dimensional FFT. 
This function calls the following OpenCL kernels: @@ -49,6 +49,12 @@ class CLFFT1D : public IFunction public: /** Default Constructor */ CLFFT1D(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLFFT1D(const CLFFT1D &) = delete; + /** Prevent instances of this class from being copied */ + CLFFT1D &operator=(const CLFFT1D &) = delete; + /** Default destructor */ + ~CLFFT1D(); /** Initialise the function's source, destinations and border mode. * * @param[in] input Source tensor. Data types supported: F32. @@ -78,14 +84,14 @@ public: void run() override; protected: - MemoryGroup _memory_group; - CLFFTDigitReverseKernel _digit_reverse_kernel; - std::vector _fft_kernels; - CLFFTScaleKernel _scale_kernel; - CLTensor _digit_reversed_input; - CLTensor _digit_reverse_indices; - unsigned int _num_ffts; - bool _run_scale; + MemoryGroup _memory_group; + std::unique_ptr _digit_reverse_kernel; + std::vector> _fft_kernels; + std::unique_ptr _scale_kernel; + CLTensor _digit_reversed_input; + CLTensor _digit_reverse_indices; + unsigned int _num_ffts; + bool _run_scale; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLFFT1D_H */ diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h index 9ceebeaa32..126944b323 100644 --- a/arm_compute/runtime/CL/functions/CLFFT2D.h +++ b/arm_compute/runtime/CL/functions/CLFFT2D.h @@ -46,6 +46,12 @@ class CLFFT2D : public IFunction public: /** Default Constructor */ CLFFT2D(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLFFT2D(const CLFFT2D &) = delete; + /** Prevent instances of this class from being copied */ + CLFFT2D &operator=(const CLFFT2D &) = delete; + /** Default destructor */ + ~CLFFT2D(); /** Initialise the function's source, destinations and border mode. * * @param[in] input Source tensor. Data types supported: F32. 
diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h index 698cc67995..e110582c50 100644 --- a/arm_compute/runtime/CL/functions/CLFastCorners.h +++ b/arm_compute/runtime/CL/functions/CLFastCorners.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_CLFASTCORNERS_H #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/CL/CLArray.h" @@ -40,6 +39,8 @@ namespace arm_compute { +class CLFastCornersKernel; +class CLCopyToArrayKernel; class ICLTensor; using ICLImage = ICLTensor; @@ -59,6 +60,8 @@ public: CLFastCorners(const CLFastCorners &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ const CLFastCorners &operator=(const CLFastCorners &) = delete; + /** Default destructor */ + ~CLFastCorners(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in] input Source image. Data types supported: U8. 
@@ -88,18 +91,18 @@ public: void run() override; private: - MemoryGroup _memory_group; - CLFastCornersKernel _fast_corners_kernel; - CLNonMaximaSuppression3x3 _suppr_func; - CLCopyToArrayKernel _copy_array_kernel; - CLImage _output; - CLImage _suppr; - Window _win; - bool _non_max; - unsigned int *_num_corners; - cl::Buffer _num_buffer; - ICLKeyPointArray *_corners; - uint8_t _constant_border_value; + MemoryGroup _memory_group; + std::unique_ptr _fast_corners_kernel; + CLNonMaximaSuppression3x3 _suppr_func; + std::unique_ptr _copy_array_kernel; + CLImage _output; + CLImage _suppr; + Window _win; + bool _non_max; + unsigned int *_num_corners; + cl::Buffer _num_buffer; + ICLKeyPointArray *_corners; + uint8_t _constant_border_value; }; } #endif /*ARM_COMPUTE_CLFASTCORNERS_H */ diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h index b79b234158..fef8324432 100644 --- a/arm_compute/runtime/CL/functions/CLFill.h +++ b/arm_compute/runtime/CL/functions/CLFill.h @@ -30,6 +30,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Function to run @ref CLMemsetKernel to fill a tensor with a scalar value */ diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h index 18bc20e654..a4ad82dfd4 100644 --- a/arm_compute/runtime/CL/functions/CLFillBorder.h +++ b/arm_compute/runtime/CL/functions/CLFillBorder.h @@ -30,6 +30,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLFillBorderKernel */ diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h index b8139c2260..f5f4ff554f 100644 --- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h +++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to execute flatten. 
This function calls the following OpenCL kernel: * diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h index 93c3639f89..85d7071194 100644 --- a/arm_compute/runtime/CL/functions/CLFloor.h +++ b/arm_compute/runtime/CL/functions/CLFloor.h @@ -30,7 +30,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLFloorKernel */ class CLFloor : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index 29788742d7..3f17e4a921 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -26,7 +26,6 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h index de6d5617c2..e35905fcf1 100644 --- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h +++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h @@ -24,14 +24,18 @@ #ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H #define ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H -#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include + namespace arm_compute { // Forward declarations +class CLCompileContext; +class CLFuseBatchNormalizationKernel; class ICLTensor; +class ITensorInfo; /** Basic function to fuse the batch normalization node to a preceding convolution node */ class CLFuseBatchNormalization : public IFunction @@ -48,7 +52,7 @@ public: /** Allow 
instances of this class to be moved */ CLFuseBatchNormalization &operator=(CLFuseBatchNormalization &&) = default; /** Default destructor */ - ~CLFuseBatchNormalization() = default; + ~CLFuseBatchNormalization(); /** Set the input and output tensors. * * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -112,7 +116,7 @@ public: void run() override; private: - CLFuseBatchNormalizationKernel _fuse_bn_kernel; + std::unique_ptr _fuse_bn_kernel; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H */ diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h index 92f9736e35..0b13e7dbbf 100644 --- a/arm_compute/runtime/CL/functions/CLGEMM.h +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -24,11 +24,6 @@ #ifndef ARM_COMPUTE_CLGEMM_H #define ARM_COMPUTE_CLGEMM_H -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTypes.h" #include "arm_compute/runtime/IFunction.h" @@ -36,9 +31,18 @@ #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include + namespace arm_compute { +class CLCompileContext; +class CLGEMMReshapeRHSMatrixKernel; +class CLGEMMMatrixMultiplyKernel; +class CLGEMMMatrixMultiplyReshapedKernel; +class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel; +class CLGEMMReshapeLHSMatrixKernel; class ICLTensor; +class ITensorInfo; namespace weights_transformations { @@ -46,41 +50,36 @@ namespace weights_transformations class CLGEMMReshapeRHSMatrixKernelManaged : 
public ITransformWeights { public: + /** Default constructor */ + CLGEMMReshapeRHSMatrixKernelManaged(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMReshapeRHSMatrixKernelManaged(const CLGEMMReshapeRHSMatrixKernelManaged &) = delete; + /** Default move constructor */ + CLGEMMReshapeRHSMatrixKernelManaged(CLGEMMReshapeRHSMatrixKernelManaged &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMReshapeRHSMatrixKernelManaged &operator=(const CLGEMMReshapeRHSMatrixKernelManaged &) = delete; + /** Default move assignment operator */ + CLGEMMReshapeRHSMatrixKernelManaged &operator=(CLGEMMReshapeRHSMatrixKernelManaged &&) = default; + /** Default desctructor */ + ~CLGEMMReshapeRHSMatrixKernelManaged(); //Inherited method override - void run() override - { - _output.allocator()->allocate(); - CLScheduler::get().enqueue(_kernel, false); - _reshape_run = true; - } + void run() override; //Inherited method override - void release() override - { - _output.allocator()->free(); - } + void release() override; //Inherited method override - ICLTensor *get_weights() override - { - return &_output; - } + ICLTensor *get_weights() override; //Inherited method override - uint32_t uid() override - { - return _uid; - } + uint32_t uid() override; /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel * * @param[in] input Input tensor. Data types supported: All * @param[in] info RHS matrix information to be used for reshaping. */ - void configure(const ICLTensor *input, GEMMRHSMatrixInfo info) - { - configure(CLKernelLibrary::get().get_compile_context(), input, info); - } + void configure(const ICLTensor *input, GEMMRHSMatrixInfo info); /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel * @@ -88,15 +87,12 @@ public: * @param[in] input Input tensor. Data types supported: All * @param[in] info RHS matrix information to be used for reshaping. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info) - { - _kernel.configure(compile_context, input, &_output, info); - } + void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info); private: - static constexpr uint32_t _uid = 0x15; - CLTensor _output{}; - CLGEMMReshapeRHSMatrixKernel _kernel{}; + static constexpr uint32_t _uid{ 0x15 }; + CLTensor _output{}; + std::unique_ptr _kernel; }; } // namespace weights_transformations @@ -126,6 +122,8 @@ public: CLGEMM &operator=(const CLGEMM &) = delete; /** Default move assignment operator */ CLGEMM &operator=(CLGEMM &&) = default; + /** Default destructor */ + ~CLGEMM(); /** Initialise the kernel's inputs and output * * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. @@ -198,24 +196,24 @@ private: static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - CLGEMMMatrixMultiplyKernel _mm_kernel; - CLGEMMReshapeLHSMatrixKernel _reshape_lhs_kernel; - CLGEMMReshapeRHSMatrixKernel _reshape_rhs_kernel; - weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged _reshape_rhs_kernel_managed; - CLGEMMMatrixMultiplyReshapedKernel _mm_reshaped_kernel; - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel; - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_fallback_kernel; - CLTensor _tmp_a; - CLTensor _tmp_b; - const ICLTensor *_original_b; - const ICLTensor *_lhs; - ICLTensor *_dst; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _has_pad_y; - CLGEMMKernelType _gemm_kernel_type; + MemoryGroup 
_memory_group; + IWeightsManager *_weights_manager; + std::unique_ptr _mm_kernel; + std::unique_ptr _reshape_lhs_kernel; + std::unique_ptr _reshape_rhs_kernel; + std::unique_ptr _reshape_rhs_kernel_managed; + std::unique_ptr _mm_reshaped_kernel; + std::unique_ptr _mm_reshaped_only_rhs_kernel; + std::unique_ptr _mm_reshaped_only_rhs_fallback_kernel; + CLTensor _tmp_a; + CLTensor _tmp_b; + const ICLTensor *_original_b; + const ICLTensor *_lhs; + ICLTensor *_dst; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _has_pad_y; + CLGEMMKernelType _gemm_kernel_type; }; } // namespace arm_compute diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 467045cd86..340ac6e749 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -26,9 +26,7 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" -#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" -#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" @@ -43,6 +41,9 @@ namespace arm_compute { +class CLCol2ImKernel; +class CLIm2ColKernel; +class CLWeightsReshapeKernel; class ICLTensor; /** Function to reshape and transpose the weights. 
This function calls the following kernels: @@ -53,6 +54,16 @@ class CLConvolutionLayerReshapeWeights : public IFunction public: /** Constructor */ CLConvolutionLayerReshapeWeights(); + /** Prevent instances of this class from being copied */ + CLConvolutionLayerReshapeWeights(const CLConvolutionLayerReshapeWeights &) = delete; + /** Prevent instances of this class from being copied */ + CLConvolutionLayerReshapeWeights &operator=(const CLConvolutionLayerReshapeWeights &) = delete; + /** Prevent instances of this class to be moved */ + CLConvolutionLayerReshapeWeights(CLConvolutionLayerReshapeWeights &&) = delete; + /** Prevent instances of this class to be moved */ + CLConvolutionLayerReshapeWeights &operator=(CLConvolutionLayerReshapeWeights &&) = delete; + /** Default destructor */ + ~CLConvolutionLayerReshapeWeights(); /** Set the input and output tensors. * * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. @@ -87,7 +98,7 @@ public: void run() override; private: - CLWeightsReshapeKernel _weights_reshape_kernel; + std::unique_ptr _weights_reshape_kernel; }; namespace weights_transformations @@ -179,6 +190,8 @@ public: CLGEMMConvolutionLayer &operator=(const CLGEMMConvolutionLayer &) = delete; /** Default move assignment operator */ CLGEMMConvolutionLayer &operator=(CLGEMMConvolutionLayer &&) = default; + /**Default destructor */ + ~CLGEMMConvolutionLayer(); /** Set the input and output tensors. * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], @@ -288,10 +301,10 @@ private: IWeightsManager *_weights_manager; CLConvolutionLayerReshapeWeights _reshape_weights; weights_transformations::CLConvolutionLayerReshapeWeightsTransform _reshape_weights_managed; - CLIm2ColKernel _im2col_kernel; + std::unique_ptr _im2col_kernel; CLGEMM _mm_gemm; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - CLCol2ImKernel _col2im_kernel; + std::unique_ptr _col2im_kernel; CLActivationLayer _activationlayer_function; const ICLTensor *_original_weights; diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h index 1fedeff444..32af0f9427 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H #define ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H -#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" @@ -40,6 +39,7 @@ namespace arm_compute { +class CLDeconvolutionReshapeOutputKernel; class ICLTensor; /** Function to run the deconvolution layer through a call to GEMM. * @@ -89,6 +89,8 @@ public: CLGEMMDeconvolutionLayer &operator=(const CLGEMMDeconvolutionLayer &) = delete; /** Default move assignment operator */ CLGEMMDeconvolutionLayer &operator=(CLGEMMDeconvolutionLayer &&) = default; + /** Default desctructor */ + ~CLGEMMDeconvolutionLayer(); /** Set the input, weights, biases and output tensors. * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. 
@@ -130,15 +132,15 @@ public: private: MemoryGroup _memory_group; - CLGEMM _mm_gemm; - CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - CLGEMMLowpOutputStage _gemmlowp_output_stage; - CLPermute _permute_input_to_nhwc; - CLPermute _permute_weights_to_nhwc; - CLReshapeLayer _reshape_weights; - CLTranspose _transpose_weights; - CLDeconvolutionReshapeOutputKernel _deconv_reshape; - CLSlice _slice_gemm; + CLGEMM _mm_gemm; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLGEMMLowpOutputStage _gemmlowp_output_stage; + CLPermute _permute_input_to_nhwc; + CLPermute _permute_weights_to_nhwc; + CLReshapeLayer _reshape_weights; + CLTranspose _transpose_weights; + std::unique_ptr _deconv_reshape; + CLSlice _slice_gemm; CLTensor _gemmlowp_final; CLTensor _reshaped_weights; diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 57b1e30df5..4cc8899690 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -24,21 +24,24 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H #define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" namespace arm_compute { +class CLCompileContext; class IMemoryManager; class ICLTensor; +class ITensorInfo; +class 
CLDepthConvertLayerKernel; +class CLGEMMLowpMatrixMultiplyNativeKernel; +class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel; +class CLGEMMLowpOffsetContributionKernel; +class CLGEMMLowpOffsetContributionOutputStageKernel; +class CLGEMMLowpMatrixAReductionKernel; +class CLGEMMLowpMatrixBReductionKernel; +class CLGEMMReshapeRHSMatrixKernel; /** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */ class CLGEMMLowpMatrixMultiplyCore : public IFunction @@ -54,6 +57,8 @@ public: CLGEMMLowpMatrixMultiplyCore &operator=(const CLGEMMLowpMatrixMultiplyCore &) = delete; /** Default move assignment operator */ CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default; + /** Default destructor */ + ~CLGEMMLowpMatrixMultiplyCore(); /** Initialise the kernel's inputs, output * * @note GEMMLowp: low precision GEMM kernel. [A * B + C] @@ -112,14 +117,14 @@ private: MemoryGroup _memory_group; // Kernels used - CLDepthConvertLayerKernel _weights_to_qasymm8; - CLGEMMLowpMatrixMultiplyNativeKernel _mm_native_kernel; - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel; - CLGEMMReshapeRHSMatrixKernel _mtx_b_reshape_kernel; - CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel; - CLGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; + std::unique_ptr _weights_to_qasymm8; + std::unique_ptr _mm_native_kernel; + std::unique_ptr _mm_reshaped_only_rhs_kernel; + std::unique_ptr _mtx_b_reshape_kernel; + std::unique_ptr _mtx_a_reduction_kernel; + std::unique_ptr _mtx_b_reduction_kernel; + std::unique_ptr _offset_contribution_kernel; + std::unique_ptr _offset_contribution_output_stage_kernel; // Temporary tensors CLTensor _qasymm8_weights; diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h index 
44c52ffb79..a4edab9b8f 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -24,8 +24,11 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H #define ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include + /** This file contains all available output stages for GEMMLowp on OpenCL. * * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore), @@ -36,7 +39,11 @@ namespace arm_compute { +class CLCompileContext; class ITensor; +class ICLTensor; +class ITensorInfo; +struct GEMMLowpOutputStageInfo; /** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL. * diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h index e87a120ba1..9c659be6fc 100644 --- a/arm_compute/runtime/CL/functions/CLGather.h +++ b/arm_compute/runtime/CL/functions/CLGather.h @@ -25,11 +25,14 @@ #ifndef ARM_COMPUTE_CLGATHER_H #define ARM_COMPUTE_CLGATHER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLGatherKernel */ class CLGather : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h index 9fe3e9bb00..286a17618b 100644 --- a/arm_compute/runtime/CL/functions/CLGaussian3x3.h +++ b/arm_compute/runtime/CL/functions/CLGaussian3x3.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute gaussian filter 3x3. 
This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h index fb369d750b..cf5b79eaac 100644 --- a/arm_compute/runtime/CL/functions/CLGaussian5x5.h +++ b/arm_compute/runtime/CL/functions/CLGaussian5x5.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLGAUSSIAN5X5_H #define ARM_COMPUTE_CLGAUSSIAN5X5_H -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" @@ -37,6 +35,10 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLGaussian5x5HorKernel; +class CLGaussian5x5VertKernel; class ICLTensor; /** Basic function to execute gaussian filter 5x5. This function calls the following OpenCL kernels: @@ -54,6 +56,16 @@ public: * @param[in] memory_manager (Optional) Memory manager. */ CLGaussian5x5(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLGaussian5x5(const CLGaussian5x5 &) = delete; + /** Default move constructor */ + CLGaussian5x5(CLGaussian5x5 &&) = default; + /** Prevent instances of this class from being copied */ + CLGaussian5x5 &operator=(const CLGaussian5x5 &) = delete; + /** Default move assignment operator */ + CLGaussian5x5 &operator=(CLGaussian5x5 &&) = default; + /** Default destructor */ + ~CLGaussian5x5(); /** Initialise the function's source, destinations and border mode. * * @param[in,out] input Source tensor. Data types supported: U8. 
(Written to only for @p border_mode != UNDEFINED) @@ -76,11 +88,11 @@ public: void run() override; protected: - MemoryGroup _memory_group; /**< Function's memory group */ - CLGaussian5x5HorKernel _kernel_hor; /**< Horizontal pass kernel */ - CLGaussian5x5VertKernel _kernel_vert; /**< Vertical pass kernel */ - CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */ - CLImage _tmp; /**< Temporary buffer */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr _kernel_hor; /**< Horizontal pass kernel */ + std::unique_ptr _kernel_vert; /**< Vertical pass kernel */ + std::unique_ptr _border_handler; /**< Kernel to handle image borders */ + CLImage _tmp; /**< Temporary buffer */ }; } #endif /*ARM_COMPUTE_CLGAUSSIAN5X5_H */ diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h index 70f324be11..b18e5f98f0 100644 --- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h +++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h @@ -24,9 +24,6 @@ #ifndef ARM_COMPUTE_CLGAUSSIANPYRAMID_H #define ARM_COMPUTE_CLGAUSSIANPYRAMID_H -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" - -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/IPyramid.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLPyramid.h" @@ -38,7 +35,12 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; class ICLTensor; +class CLGaussianPyramidHorKernel; +class CLGaussianPyramidVertKernel; +class CLScaleKernel; /** Common interface for all Gaussian pyramid functions */ class CLGaussianPyramid : public IFunction @@ -55,7 +57,7 @@ public: /** Allow instances of this class to be moved */ CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default; /** Default destructor */ - virtual ~CLGaussianPyramid() = default; + ~CLGaussianPyramid(); /** Initialise the function's source, destinations and border mode. 
* * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -93,6 +95,12 @@ class CLGaussianPyramidHalf : public CLGaussianPyramid public: /** Constructor */ CLGaussianPyramidHalf(); + /** Prevent instances of this class from being copied */ + CLGaussianPyramidHalf(const CLGaussianPyramidHalf &) = delete; + /** Prevent instances of this class from being copied */ + CLGaussianPyramidHalf &operator=(const CLGaussianPyramidHalf &) = delete; + /** Default destructor */ + ~CLGaussianPyramidHalf(); // Inherited methods overridden: void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; @@ -100,10 +108,10 @@ public: void run() override; private: - std::vector _horizontal_border_handler; - std::vector _vertical_border_handler; - std::vector _horizontal_reduction; - std::vector _vertical_reduction; + std::vector> _horizontal_border_handler; + std::vector> _vertical_border_handler; + std::vector> _horizontal_reduction; + std::vector> _vertical_reduction; }; /** Basic function to execute gaussian pyramid with ORB scale factor. 
This function calls the following OpenCL kernels and functions: @@ -124,8 +132,8 @@ public: void run() override; private: - std::vector _gauss5x5; - std::vector _scale_nearest; + std::vector _gauss5x5; + std::vector> _scale_nearest; }; } #endif /*ARM_COMPUTE_CLGAUSSIANPYRAMID_H */ diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h index 6d5f2e5d71..0fb9a06c84 100644 --- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h +++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h @@ -23,12 +23,7 @@ */ #ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYER_H #define ARM_COMPUTE_CLGENERATEPROPOSALSLAYER_H -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" + #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" @@ -38,9 +33,19 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" +#include + namespace arm_compute { +class CLCompileContext; +class CLBoundingBoxTransformKernel; +class CLDequantizationLayerKernel; +class CLComputeAllAnchorsKernel; +class CLPadLayerKernel; +class CLPermuteKernel; +class CLQuantizationLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to generate proposals for a RPN (Region Proposal Network) * @@ -67,6 +72,8 @@ public: CLGenerateProposalsLayer(const CLGenerateProposalsLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ CLGenerateProposalsLayer &operator=(const CLGenerateProposalsLayer &) = delete; + /** Default destructor */ 
+ ~CLGenerateProposalsLayer(); /** Set the input and output tensors. * @@ -130,16 +137,16 @@ private: MemoryGroup _memory_group; // OpenCL kernels - CLPermuteKernel _permute_deltas_kernel; - CLReshapeLayer _flatten_deltas; - CLPermuteKernel _permute_scores_kernel; - CLReshapeLayer _flatten_scores; - CLComputeAllAnchorsKernel _compute_anchors_kernel; - CLBoundingBoxTransformKernel _bounding_box_kernel; - CLPadLayerKernel _pad_kernel; - CLDequantizationLayerKernel _dequantize_anchors; - CLDequantizationLayerKernel _dequantize_deltas; - CLQuantizationLayerKernel _quantize_all_proposals; + std::unique_ptr _permute_deltas_kernel; + CLReshapeLayer _flatten_deltas; + std::unique_ptr _permute_scores_kernel; + CLReshapeLayer _flatten_scores; + std::unique_ptr _compute_anchors_kernel; + std::unique_ptr _bounding_box_kernel; + std::unique_ptr _pad_kernel; + std::unique_ptr _dequantize_anchors; + std::unique_ptr _dequantize_deltas; + std::unique_ptr _quantize_all_proposals; // CPP functions CPPBoxWithNonMaximaSuppressionLimit _cpp_nms; diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h index dad7e6edf8..fa37b3c84e 100644 --- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h +++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLHOGDESCRIPTOR_H #define ARM_COMPUTE_CLHOGDESCRIPTOR_H -#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLHOGGradient.h" @@ -37,6 +36,8 @@ namespace arm_compute { class IHOG; +class CLHOGOrientationBinningKernel; +class CLHOGBlockNormalizationKernel; /** Basic function to calculate HOG descriptor. 
This function calls the following OpenCL kernels: * * -# @ref CLHOGGradient @@ -49,6 +50,12 @@ class CLHOGDescriptor : public IFunction public: /** Default constructor */ CLHOGDescriptor(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLHOGDescriptor(const CLHOGDescriptor &) = delete; + /** Prevent instances of this class from being copied */ + CLHOGDescriptor &operator=(const CLHOGDescriptor &) = delete; + /** Default destructor */ + ~CLHOGDescriptor(); /** Initialise the function's source, destination, HOG data-object and border mode * * @param[in, out] input Input tensor. Data type supported: U8 @@ -75,13 +82,13 @@ public: void run() override; private: - MemoryGroup _memory_group; - CLHOGGradient _gradient; - CLHOGOrientationBinningKernel _orient_bin; - CLHOGBlockNormalizationKernel _block_norm; - CLTensor _mag; - CLTensor _phase; - CLTensor _hog_space; + MemoryGroup _memory_group; + CLHOGGradient _gradient; + std::unique_ptr _orient_bin; + std::unique_ptr _block_norm; + CLTensor _mag; + CLTensor _phase; + CLTensor _hog_space; }; } diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h index 6697b5c24d..edc5b652d3 100644 --- a/arm_compute/runtime/CL/functions/CLHOGDetector.h +++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h @@ -24,13 +24,20 @@ #ifndef ARM_COMPUTE_CLHOGDETECTOR_H #define ARM_COMPUTE_CLHOGDETECTOR_H +#include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" #include "arm_compute/core/IHOG.h" #include "arm_compute/runtime/IFunction.h" +#include + namespace arm_compute { +class CLCompileContext; +class CLHOGDetectorKernel; +class ICLTensor; +class ICLHOG; + /** Basic function to execute HOG detector based on linear SVM. 
This function calls the following OpenCL kernel: * * -# @ref CLHOGDetectorKernel @@ -50,7 +57,7 @@ public: /** Allow instances of this class to be moved */ CLHOGDetector &operator=(CLHOGDetector &&) = default; /** Default destructor */ - ~CLHOGDetector() = default; + ~CLHOGDetector(); /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class * * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it. @@ -78,16 +85,16 @@ public: * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, - float threshold = 0.0f, + float threshold = 0.0f, size_t idx_class = 0); // Inherited methods overridden: void run() override; private: - CLHOGDetectorKernel _hog_detector_kernel; - ICLDetectionWindowArray *_detection_windows; - cl::Buffer _num_detection_windows; + std::unique_ptr _hog_detector_kernel; + ICLDetectionWindowArray *_detection_windows; + cl::Buffer _num_detection_windows; }; } diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h index b0589027e7..39d26fb110 100644 --- a/arm_compute/runtime/CL/functions/CLHOGGradient.h +++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h @@ -24,9 +24,6 @@ #ifndef ARM_COMPUTE_CLHOGGRADIENT_H #define ARM_COMPUTE_CLHOGGRADIENT_H -#include "arm_compute/core/CL/ICLKernel.h" - -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLDerivative.h" @@ -39,6 +36,9 @@ namespace arm_compute { +class CLCompileContext; +class CLMagnitudePhaseKernel; +class ITensorInfo; /** Basic function 
to calculate the gradient for HOG. This function calls the following OpenCL kernels: * * -# @ref CLDerivative @@ -79,11 +79,11 @@ public: void run() override; private: - MemoryGroup _memory_group; - CLDerivative _derivative; - CLMagnitudePhaseKernel _mag_phase; - CLTensor _gx; - CLTensor _gy; + MemoryGroup _memory_group; + CLDerivative _derivative; + std::unique_ptr _mag_phase; + CLTensor _gx; + CLTensor _gy; }; } #endif /*ARM_COMPUTE_CLHOGGRADIENT_H */ diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h index e7631c2c5a..2a2c9a0a5c 100644 --- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h +++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h @@ -26,7 +26,6 @@ #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLMultiHOG.h" -#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h" #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLHOGDetector.h" @@ -39,6 +38,9 @@ namespace arm_compute { +class CLCompileContext; +class CLHOGOrientationBinningKernel; +class CLHOGBlockNormalizationKernel; /** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels: * * -# @ref CLHOGGradient @@ -62,6 +64,8 @@ public: CLHOGMultiDetection(const CLHOGMultiDetection &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete; + /** Default destructor */ + ~CLHOGMultiDetection(); /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression * * @param[in, out] input Input tensor. 
Data type supported: U8 @@ -110,21 +114,21 @@ public: void run() override; private: - MemoryGroup _memory_group; - CLHOGGradient _gradient_kernel; - std::vector _orient_bin_kernel; - std::vector _block_norm_kernel; - std::vector _hog_detect_kernel; - CPPDetectionWindowNonMaximaSuppressionKernel _non_maxima_kernel; - std::vector _hog_space; - std::vector _hog_norm_space; - ICLDetectionWindowArray *_detection_windows; - CLTensor _mag; - CLTensor _phase; - bool _non_maxima_suppression; - size_t _num_orient_bin_kernel; - size_t _num_block_norm_kernel; - size_t _num_hog_detect_kernel; + MemoryGroup _memory_group; + CLHOGGradient _gradient_kernel; + std::vector> _orient_bin_kernel; + std::vector> _block_norm_kernel; + std::vector _hog_detect_kernel; + CPPDetectionWindowNonMaximaSuppressionKernel _non_maxima_kernel; + std::vector _hog_space; + std::vector _hog_norm_space; + ICLDetectionWindowArray *_detection_windows; + CLTensor _mag; + CLTensor _phase; + bool _non_maxima_suppression; + size_t _num_orient_bin_kernel; + size_t _num_block_norm_kernel; + size_t _num_hog_detect_kernel; }; } diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h index 326a895d39..c9c67f5a28 100644 --- a/arm_compute/runtime/CL/functions/CLHarrisCorners.h +++ b/arm_compute/runtime/CL/functions/CLHarrisCorners.h @@ -24,16 +24,13 @@ #ifndef ARM_COMPUTE_CLHARRISCORNERS_H #define ARM_COMPUTE_CLHARRISCORNERS_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" #include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" +#include 
"arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include @@ -41,6 +38,9 @@ namespace arm_compute { +class CLCompileContext; +class CLHarrisScoreKernel; +class CLFillBorderKernel; class ICLTensor; using ICLImage = ICLTensor; @@ -66,6 +66,8 @@ public: CLHarrisCorners(const CLHarrisCorners &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ const CLHarrisCorners &operator=(const CLHarrisCorners &) = delete; + /** Default destructor */ + ~CLHarrisCorners(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -104,21 +106,21 @@ public: void run() override; private: - MemoryGroup _memory_group; /**< Function's memory group */ - std::unique_ptr _sobel; /**< Sobel function */ - CLHarrisScoreKernel _harris_score; /**< Harris score kernel */ - CLNonMaximaSuppression3x3 _non_max_suppr; /**< Non-maxima suppression function */ - CPPCornerCandidatesKernel _candidates; /**< Sort kernel */ - CPPSortEuclideanDistanceKernel _sort_euclidean; /**< Euclidean distance kernel */ - CLFillBorderKernel _border_gx; /**< Border handler before running harris score */ - CLFillBorderKernel _border_gy; /**< Border handler before running harris score */ - CLImage _gx; /**< Source image - Gx component */ - CLImage _gy; /**< Source image - Gy component */ - CLImage _score; /**< Source image - Harris score */ - CLImage _nonmax; /**< Source image - Non-Maxima suppressed image */ - std::vector _corners_list; /**< Array of InternalKeypoint. 
It stores the potential corner candidates */ - int32_t _num_corner_candidates; /**< Number of potential corner candidates */ - ICLKeyPointArray *_corners; /**< Output corners array */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr _sobel; /**< Sobel function */ + std::unique_ptr _harris_score; /**< Harris score kernel */ + CLNonMaximaSuppression3x3 _non_max_suppr; /**< Non-maxima suppression function */ + CPPCornerCandidatesKernel _candidates; /**< Sort kernel */ + CPPSortEuclideanDistanceKernel _sort_euclidean; /**< Euclidean distance kernel */ + std::unique_ptr _border_gx; /**< Border handler before running harris score */ + std::unique_ptr _border_gy; /**< Border handler before running harris score */ + CLImage _gx; /**< Source image - Gx component */ + CLImage _gy; /**< Source image - Gy component */ + CLImage _score; /**< Source image - Harris score */ + CLImage _nonmax; /**< Source image - Non-Maxima suppressed image */ + std::vector _corners_list; /**< Array of InternalKeypoint. 
It stores the potential corner candidates */ + int32_t _num_corner_candidates; /**< Number of potential corner candidates */ + ICLKeyPointArray *_corners; /**< Output corners array */ }; } #endif /*ARM_COMPUTE_CLHARRISCORNERS_H */ diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h index 7fdb8a9022..164bd0a28a 100644 --- a/arm_compute/runtime/CL/functions/CLHistogram.h +++ b/arm_compute/runtime/CL/functions/CLHistogram.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLHISTOGRAM_H #define ARM_COMPUTE_CLHISTOGRAM_H -#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" #include "arm_compute/runtime/IFunction.h" +#include "src/core/CL/kernels/CLHistogramKernel.h" namespace arm_compute { diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h index d7aa11cbc8..d41f3fedf6 100644 --- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. 
* diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h index 6b10ede650..0ecdbde8fe 100644 --- a/arm_compute/runtime/CL/functions/CLIntegralImage.h +++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h @@ -24,11 +24,15 @@ #ifndef ARM_COMPUTE_CLINTEGRALIMAGE_H #define ARM_COMPUTE_CLINTEGRALIMAGE_H -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" #include "arm_compute/runtime/IFunction.h" +#include + namespace arm_compute { +class CLCompileContext; +class CLIntegralImageHorKernel; +class CLIntegralImageVertKernel; class ICLTensor; /** Basic function to execute integral image. This function calls the following OpenCL kernels: @@ -42,6 +46,12 @@ class CLIntegralImage : public IFunction public: /** Default Constructor. */ CLIntegralImage(); + /** Prevent instances of this class from being copied */ + CLIntegralImage(const CLIntegralImage &) = delete; + /** Prevent instances of this class from being copied */ + CLIntegralImage &operator=(const CLIntegralImage &) = delete; + /** Default destructor */ + ~CLIntegralImage(); /** Initialise the function's source, destinations and border mode. * * @param[in] input Source tensor. Data types supported: U8. 
@@ -60,8 +70,8 @@ public: void run() override; protected: - CLIntegralImageHorKernel _integral_hor; /**< Integral Image Horizontal kernel */ - CLIntegralImageVertKernel _integral_vert; /**< Integral Image Vertical kernel */ + std::unique_ptr _integral_hor; /**< Integral Image Horizontal kernel */ + std::unique_ptr _integral_vert; /**< Integral Image Vertical kernel */ }; } #endif /*ARM_COMPUTE_CLINTEGRALIMAGE_H */ diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h index bc79101d9d..401d249eb4 100644 --- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h +++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLL2NORMALIZELAYER_H #define ARM_COMPUTE_CLL2NORMALIZELAYER_H -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" @@ -37,7 +36,10 @@ namespace arm_compute { +class CLCompileContext; +class CLL2NormalizeLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to perform a L2 normalization on a given axis. * @@ -50,6 +52,16 @@ class CLL2NormalizeLayer : public IFunction public: /** Constructor */ CLL2NormalizeLayer(std::shared_ptr memory_manager = nullptr); + /** Default Destructor */ + ~CLL2NormalizeLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLL2NormalizeLayer(const CLL2NormalizeLayer &) = delete; + /** Default move constructor */ + CLL2NormalizeLayer(CLL2NormalizeLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLL2NormalizeLayer &operator=(const CLL2NormalizeLayer &) = delete; + /** Default move assignment operator */ + CLL2NormalizeLayer &operator=(CLL2NormalizeLayer &&) = default; /** Set the input and output tensors. 
* @@ -84,10 +96,10 @@ public: void run() override; private: - MemoryGroup _memory_group; - CLReductionOperation _reduce_func; - CLL2NormalizeLayerKernel _normalize_kernel; - CLTensor _sumsq; + MemoryGroup _memory_group; + CLReductionOperation _reduce_func; + std::unique_ptr _normalize_kernel; + CLTensor _sumsq; }; } #endif /*ARM_COMPUTE_CLL2NORMALIZELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index 1a8b33463d..017f26aa1e 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -26,8 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" @@ -45,6 +43,10 @@ namespace arm_compute { +class CLCompileContext; +class CLCopyKernel; +class CLMemsetKernel; +class CLTransposeKernel; class ICLTensor; /** This function performs a single time step in a Long Short-Term Memory (LSTM) layer. @@ -55,6 +57,16 @@ class CLLSTMLayer : public IFunction public: /** Default constructor */ CLLSTMLayer(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLLSTMLayer(const CLLSTMLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLLSTMLayer &operator=(const CLLSTMLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLLSTMLayer(CLLSTMLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLLSTMLayer &operator=(CLLSTMLayer &&) = delete; + /** Default destructor */ + ~CLLSTMLayer(); /** Initialize function's tensors. * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32. 
@@ -200,90 +212,90 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - CLFullyConnectedLayer _fully_connected_input_gate; - CLArithmeticAddition _accum_input_gate1; - CLArithmeticSubtraction _subtract_input_gate; - CLPixelWiseMultiplication _pixelwise_mul_input_gate; - CLActivationLayer _activation_input_gate; - CLFullyConnectedLayer _fully_connected_forget_gate; - CLArithmeticAddition _accum_forget_gate1; - CLPixelWiseMultiplication _pixelwise_mul_forget_gate; - CLActivationLayer _activation_forget_gate; - CLFullyConnectedLayer _fully_connected_cell_state; - CLGEMM _gemm_cell_state1; - CLTransposeKernel _transpose_cell_state; - CLArithmeticAddition _accum_cell_state1; - CLArithmeticAddition _accum_cell_state2; - CLPixelWiseMultiplication _pixelwise_mul_cell_state1; - CLActivationLayer _activation_cell_state; - CLActivationLayer _cell_clip; - CLPixelWiseMultiplication _pixelwise_mul_cell_state2; - CLFullyConnectedLayer _fully_connected_output; - CLPixelWiseMultiplication _pixelwise_mul_output_state1; - CLArithmeticAddition _accum_output1; - CLActivationLayer _activation_output; - CLActivationLayer _activation_output_state; - CLPixelWiseMultiplication _pixelwise_mul_output_state2; - CLFullyConnectedLayer _fully_connected_output_state; - CLActivationLayer _projection_clip; - CLCopyKernel _copy_cell_state; - CLCopyKernel _copy_output; - CLConcatenateLayer _concat_scratch_buffer; - CLConcatenateLayer _concat_inputs_forget_gate; - CLConcatenateLayer _concat_weights_forget_gate; - CLConcatenateLayer _concat_weights_input_gate; - CLConcatenateLayer _concat_weights_output; - CLMemsetKernel _ones_memset_kernel; - CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; - CLPixelWiseMultiplication _pixelwise_mul_input_gate_coeff; - CLArithmeticAddition _accum_input_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; - CLPixelWiseMultiplication _pixelwise_mul_forget_gate_coeff; - CLArithmeticAddition _accum_forget_gate_bias; - 
CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; - CLPixelWiseMultiplication _pixelwise_mul_cell_gate_coeff; - CLArithmeticAddition _accum_cell_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate; - CLPixelWiseMultiplication _pixelwise_mul_output_gate_coeff; - CLArithmeticAddition _accum_output_gate_bias; - CLTensor _input_gate_out1; - CLTensor _input_gate_out2; - CLTensor _input_gate_out3; - CLTensor _input_gate_out4; - CLTensor _forget_gate_out1; - CLTensor _forget_gate_out2; - CLTensor _forget_gate_out3; - CLTensor _forget_gate_out4; - CLTensor _forget_gate_out5; - CLTensor _forget_gate_out6; - CLTensor _cell_state_out1; - CLTensor _cell_state_out2; - CLTensor _cell_state_out3; - CLTensor _cell_state_out4; - CLTensor _cell_state_out5; - CLTensor _output1; - CLTensor _output2; - CLTensor _output3; - CLTensor _output4; - CLTensor _cell_state_activation; - CLTensor _output_state1; - CLTensor _ones; - CLTensor _input_layer_norm_out1; - CLTensor _input_layer_norm_out2; - CLTensor _forget_layer_norm_out1; - CLTensor _forget_layer_norm_out2; - CLTensor _cell_layer_norm_out1; - CLTensor _cell_layer_norm_out2; - CLTensor _output_layer_norm_out1; - CLTensor _output_layer_norm_out2; - bool _run_peephole_opt; - bool _run_cifg_opt; - bool _perform_cell_clipping; - bool _has_projection_weights; - bool _perform_projection_clipping; - bool _is_prepared; - bool _is_layer_norm_lstm; + MemoryGroup _memory_group; + CLFullyConnectedLayer _fully_connected_input_gate; + CLArithmeticAddition _accum_input_gate1; + CLArithmeticSubtraction _subtract_input_gate; + CLPixelWiseMultiplication _pixelwise_mul_input_gate; + CLActivationLayer _activation_input_gate; + CLFullyConnectedLayer _fully_connected_forget_gate; + CLArithmeticAddition _accum_forget_gate1; + CLPixelWiseMultiplication _pixelwise_mul_forget_gate; + CLActivationLayer _activation_forget_gate; + CLFullyConnectedLayer _fully_connected_cell_state; + CLGEMM _gemm_cell_state1; + std::unique_ptr 
_transpose_cell_state; + CLArithmeticAddition _accum_cell_state1; + CLArithmeticAddition _accum_cell_state2; + CLPixelWiseMultiplication _pixelwise_mul_cell_state1; + CLActivationLayer _activation_cell_state; + CLActivationLayer _cell_clip; + CLPixelWiseMultiplication _pixelwise_mul_cell_state2; + CLFullyConnectedLayer _fully_connected_output; + CLPixelWiseMultiplication _pixelwise_mul_output_state1; + CLArithmeticAddition _accum_output1; + CLActivationLayer _activation_output; + CLActivationLayer _activation_output_state; + CLPixelWiseMultiplication _pixelwise_mul_output_state2; + CLFullyConnectedLayer _fully_connected_output_state; + CLActivationLayer _projection_clip; + std::unique_ptr _copy_cell_state; + std::unique_ptr _copy_output; + CLConcatenateLayer _concat_scratch_buffer; + CLConcatenateLayer _concat_inputs_forget_gate; + CLConcatenateLayer _concat_weights_forget_gate; + CLConcatenateLayer _concat_weights_input_gate; + CLConcatenateLayer _concat_weights_output; + std::unique_ptr _ones_memset_kernel; + CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; + CLPixelWiseMultiplication _pixelwise_mul_input_gate_coeff; + CLArithmeticAddition _accum_input_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; + CLPixelWiseMultiplication _pixelwise_mul_forget_gate_coeff; + CLArithmeticAddition _accum_forget_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; + CLPixelWiseMultiplication _pixelwise_mul_cell_gate_coeff; + CLArithmeticAddition _accum_cell_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate; + CLPixelWiseMultiplication _pixelwise_mul_output_gate_coeff; + CLArithmeticAddition _accum_output_gate_bias; + CLTensor _input_gate_out1; + CLTensor _input_gate_out2; + CLTensor _input_gate_out3; + CLTensor _input_gate_out4; + CLTensor _forget_gate_out1; + CLTensor _forget_gate_out2; + CLTensor _forget_gate_out3; + CLTensor _forget_gate_out4; + CLTensor _forget_gate_out5; + CLTensor _forget_gate_out6; + 
CLTensor _cell_state_out1; + CLTensor _cell_state_out2; + CLTensor _cell_state_out3; + CLTensor _cell_state_out4; + CLTensor _cell_state_out5; + CLTensor _output1; + CLTensor _output2; + CLTensor _output3; + CLTensor _output4; + CLTensor _cell_state_activation; + CLTensor _output_state1; + CLTensor _ones; + CLTensor _input_layer_norm_out1; + CLTensor _input_layer_norm_out2; + CLTensor _forget_layer_norm_out1; + CLTensor _forget_layer_norm_out2; + CLTensor _cell_layer_norm_out1; + CLTensor _cell_layer_norm_out2; + CLTensor _output_layer_norm_out1; + CLTensor _output_layer_norm_out2; + bool _run_peephole_opt; + bool _run_cifg_opt; + bool _perform_cell_clipping; + bool _has_projection_weights; + bool _perform_projection_clipping; + bool _is_prepared; + bool _is_layer_norm_lstm; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLLSTMLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h index ba85c6140c..3bbf9f2c30 100644 --- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h @@ -26,10 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" -#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" -#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -39,7 +35,13 @@ namespace arm_compute { +class CLCompileContext; +class CLCol2ImKernel; +class CLIm2ColKernel; +class CLWeightsReshapeKernel; +class CLLocallyConnectedMatrixMultiplyKernel; class ICLTensor; +class ITensorInfo; /** Basic function to compute the locally connected layer. 
This function calls the following OpenCL kernels: * @@ -108,16 +110,16 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - CLIm2ColKernel _input_im2col_kernel; - CLWeightsReshapeKernel _weights_reshape_kernel; - CLLocallyConnectedMatrixMultiplyKernel _mm_kernel; - CLCol2ImKernel _output_col2im_kernel; - CLTensor _input_im2col_reshaped; - CLTensor _weights_reshaped; - CLTensor _gemm_output; - bool _is_prepared; - const ICLTensor *_original_weights; + MemoryGroup _memory_group; + std::unique_ptr<CLIm2ColKernel> _input_im2col_kernel; + std::unique_ptr<CLWeightsReshapeKernel> _weights_reshape_kernel; + std::unique_ptr<CLLocallyConnectedMatrixMultiplyKernel> _mm_kernel; + std::unique_ptr<CLCol2ImKernel> _output_col2im_kernel; + CLTensor _input_im2col_reshaped; + CLTensor _weights_reshaped; + CLTensor _gemm_output; + bool _is_prepared; + const ICLTensor *_original_weights; }; } #endif /* ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h index ad7cc778e5..6ac141641c 100644 --- a/arm_compute/runtime/CL/functions/CLMagnitude.h +++ b/arm_compute/runtime/CL/functions/CLMagnitude.h @@ -29,6 +29,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLMagnitudePhaseKernel. 
*/ diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h index 5c8548f9e0..693862fb89 100644 --- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h @@ -24,14 +24,19 @@ #ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H #define ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" +#include <memory> namespace arm_compute { -class ITensor; +class CLCompileContext; +class ICLTensor; +class ITensorInfo; +class CLMaxUnpoolingLayerKernel; +class CLMemsetKernel; +struct PoolingLayerInfo; /** Function to perform MaxUnpooling. This function calls the following OpenCL kernels: * @@ -43,6 +48,12 @@ class CLMaxUnpoolingLayer : public IFunction public: /** Constructor */ CLMaxUnpoolingLayer(); + /** Prevent instances of this class from being copied */ + CLMaxUnpoolingLayer(const CLMaxUnpoolingLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLMaxUnpoolingLayer &operator=(const CLMaxUnpoolingLayer &) = delete; + /** Default destructor */ + ~CLMaxUnpoolingLayer(); /** Set the input and output tensors. * * @note Output shape must be equal to the shape of the original input to pool. 
@@ -88,8 +99,8 @@ public: void run() override; private: - CLMemsetKernel _memset_kernel; - CLMaxUnpoolingLayerKernel _unpooling_layer_kernel; + std::unique_ptr<CLMemsetKernel> _memset_kernel; + std::unique_ptr<CLMaxUnpoolingLayerKernel> _unpooling_layer_kernel; }; } #endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h index be192a7c11..d9ced1393e 100644 --- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h +++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h @@ -25,15 +25,20 @@ #define ARM_COMPUTE_CLMEANSTDDEV_H #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include <memory> + namespace arm_compute { +class CLCompileContext; +class ICLTensor; +class ITensorInfo; +class CLFillBorderKernel; +class CLMeanStdDevKernel; /** Basic function to execute mean and standard deviation by calling @ref CLMeanStdDevKernel */ class CLMeanStdDev : public IFunction { @@ -49,7 +54,7 @@ public: /** Allow instances of this class to be moved */ CLMeanStdDev &operator=(CLMeanStdDev &&) = default; /** Default destructor */ - ~CLMeanStdDev() = default; + ~CLMeanStdDev(); /** Initialise the kernel's inputs and outputs. * * @param[in, out] input Input image. Data types supported: U8/F16/F32. (Written to only for border filling) @@ -83,20 +88,20 @@ private: void run_float(); void run_int(); - MemoryGroup _memory_group; /**< Function's memory group */ - DataType _data_type; /**< Input data type. */ - unsigned int _num_pixels; /**< Number of image's pixels. */ - bool _run_stddev; /**< Flag for knowing if we should run stddev reduction function. 
*/ - CLReductionOperation _reduction_operation_mean; /**< Reduction operation function for computing mean value. */ - CLReductionOperation _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */ - CLTensor _reduction_output_mean; /**< Reduction operation output tensor for mean value. */ - CLTensor _reduction_output_stddev; /**< Reduction operation output tensor for standard deviation value. */ - float *_mean; /**< Pointer that holds the mean value. */ - float *_stddev; /**< Pointer that holds the standard deviation value. */ - CLMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that standard deviation calculation. */ - CLFillBorderKernel _fill_border_kernel; /**< Kernel that fills the border with zeroes. */ - cl::Buffer _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */ - cl::Buffer _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */ + MemoryGroup _memory_group; /**< Function's memory group */ + DataType _data_type; /**< Input data type. */ + unsigned int _num_pixels; /**< Number of image's pixels. */ + bool _run_stddev; /**< Flag for knowing if we should run stddev reduction function. */ + CLReductionOperation _reduction_operation_mean; /**< Reduction operation function for computing mean value. */ + CLReductionOperation _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */ + CLTensor _reduction_output_mean; /**< Reduction operation output tensor for mean value. */ + CLTensor _reduction_output_stddev; /**< Reduction operation output tensor for standard deviation value. */ + float *_mean; /**< Pointer that holds the mean value. */ + float *_stddev; /**< Pointer that holds the standard deviation value. */ + std::unique_ptr _mean_stddev_kernel; /**< Kernel that standard deviation calculation. 
*/ + std::unique_ptr _fill_border_kernel; /**< Kernel that fills the border with zeroes. */ + cl::Buffer _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */ + cl::Buffer _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */ }; } #endif /*ARM_COMPUTE_CLMEANSTDDEV_H */ diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h index 1627de1ae8..cfe59eac09 100644 --- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to execute mean and standard deviation normalization by calling @ref CLMeanStdDevNormalizationKernel */ class CLMeanStdDevNormalizationLayer : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h index 7f67f958c1..6c0458203e 100644 --- a/arm_compute/runtime/CL/functions/CLMedian3x3.h +++ b/arm_compute/runtime/CL/functions/CLMedian3x3.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute median filter. 
This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h index 04926f7bd0..4e3f28b006 100644 --- a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h +++ b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h @@ -24,12 +24,16 @@ #ifndef ARM_COMPUTE_CLMINMAXLOCATION_H #define ARM_COMPUTE_CLMINMAXLOCATION_H -#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" #include "arm_compute/runtime/CL/CLArray.h" #include "arm_compute/runtime/IFunction.h" +#include + namespace arm_compute { +class CLCompileContext; +class CLMinMaxKernel; +class CLMinMaxLocationKernel; class ICLTensor; using ICLImage = ICLTensor; @@ -51,6 +55,8 @@ public: CLMinMaxLocation(CLMinMaxLocation &&) = default; /** Allow instances of this class to be moved */ CLMinMaxLocation &operator=(CLMinMaxLocation &&) = default; + /** Default destructor */ + ~CLMinMaxLocation(); /** Initialise the kernel's inputs and outputs. * * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size. @@ -87,16 +93,16 @@ public: void run() override; private: - CLMinMaxKernel _min_max_kernel; /**< Kernel that performs min/max */ - CLMinMaxLocationKernel _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */ - cl::Buffer _min_max_vals; /**< Buffer to collect min, max values */ - cl::Buffer _min_max_count_vals; /**< Buffer to collect min, max values */ - void *_min; /**< Minimum value. */ - void *_max; /**< Maximum value. */ - uint32_t *_min_count; /**< Minimum value occurrences. */ - uint32_t *_max_count; /**< Maximum value occurrences. */ - CLCoordinates2DArray *_min_loc; /**< Minimum value occurrences coordinates. */ - CLCoordinates2DArray *_max_loc; /**< Maximum value occurrences coordinates. 
*/ + std::unique_ptr<CLMinMaxKernel> _min_max_kernel; /**< Kernel that performs min/max */ + std::unique_ptr<CLMinMaxLocationKernel> _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */ + cl::Buffer _min_max_vals; /**< Buffer to collect min, max values */ + cl::Buffer _min_max_count_vals; /**< Buffer to collect min, max values */ + void *_min; /**< Minimum value. */ + void *_max; /**< Maximum value. */ + uint32_t *_min_count; /**< Minimum value occurrences. */ + uint32_t *_max_count; /**< Maximum value occurrences. */ + CLCoordinates2DArray *_min_loc; /**< Minimum value occurrences coordinates. */ + CLCoordinates2DArray *_max_loc; /**< Maximum value occurrences coordinates. */ }; } #endif /*ARM_COMPUTE_CLMINMAXLOCATION_H */ diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h index 8b7e350e09..1b466bf662 100644 --- a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h +++ b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute non linear filter. This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h index 556de1c64c..c767a042ff 100644 --- a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h +++ b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h @@ -29,6 +29,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute non-maxima suppression over a 3x3 window. 
This function calls the following CL kernels: diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h index a2d46b368f..389b21e5c8 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h @@ -24,18 +24,19 @@ #ifndef ARM_COMPUTE_CLNORMALIZATIONLAYER_H #define ARM_COMPUTE_CLNORMALIZATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" +#include namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLNormalizationLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to compute a normalization layer. This function calls the following CL kernels: * @@ -48,6 +49,16 @@ class CLNormalizationLayer : public IFunction public: /** Default constructor */ CLNormalizationLayer(); + /** Prevent instances of this class from being copied */ + CLNormalizationLayer(const CLNormalizationLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLNormalizationLayer &operator=(const CLNormalizationLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLNormalizationLayer(CLNormalizationLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLNormalizationLayer &operator=(CLNormalizationLayer &&) = delete; + /** Default destructor */ + ~CLNormalizationLayer(); /** Set the input and output tensors. * * @param[in, out] input Source tensor. 
3 lower dims represent a single input with dimensions [width, height, IFM], @@ -85,8 +96,8 @@ public: void run() override; private: - CLNormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel to run */ - CLFillBorderKernel _border_handler; /**< Kernel to handle borders */ + std::unique_ptr _norm_kernel; /**< Normalization layer kernel to run */ + std::unique_ptr _border_handler; /**< Kernel to handle borders */ }; } #endif /* ARM_COMPUTE_CLNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h index cf4a9b6497..de5155c65a 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLNormalizePlanarYUVLayerKernel * diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h index adce6748c8..0e34374aa5 100644 --- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h +++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLOPTICALFLOW_H #define ARM_COMPUTE_CLOPTICALFLOW_H -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" - #include "arm_compute/core/IArray.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLArray.h" @@ -41,7 +39,12 @@ namespace arm_compute { +class CLCompileContext; class CLPyramid; +class CLLKTrackerInitKernel; +class CLLKTrackerStage0Kernel; +class CLLKTrackerStage1Kernel; +class CLLKTrackerFinalizeKernel; /** OpenCL Array of Internal Keypoints */ using CLLKInternalKeypointArray = CLArray; @@ -71,6 +74,8 @@ public: CLOpticalFlow(CLOpticalFlow &&) = default; /** Allow instances of this class to be moved */ CLOpticalFlow &operator=(CLOpticalFlow &&) = default; + /** Default destructor */ + 
~CLOpticalFlow(); /** Initialise the function input and output * * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data types supported U8 @@ -117,22 +122,22 @@ public: void run() override; private: - MemoryGroup _memory_group; - std::vector _tracker_init_kernel; - std::vector _tracker_stage0_kernel; - std::vector _tracker_stage1_kernel; - CLLKTrackerFinalizeKernel _tracker_finalize_kernel; - std::vector _func_scharr; - std::vector _scharr_gx; - std::vector _scharr_gy; - const ICLKeyPointArray *_old_points; - const ICLKeyPointArray *_new_points_estimates; - ICLKeyPointArray *_new_points; - std::unique_ptr _old_points_internal; - std::unique_ptr _new_points_internal; - std::unique_ptr _coefficient_table; - std::unique_ptr _old_values; - size_t _num_levels; + MemoryGroup _memory_group; + std::vector> _tracker_init_kernel; + std::vector> _tracker_stage0_kernel; + std::vector> _tracker_stage1_kernel; + std::unique_ptr _tracker_finalize_kernel; + std::vector _func_scharr; + std::vector _scharr_gx; + std::vector _scharr_gy; + const ICLKeyPointArray *_old_points; + const ICLKeyPointArray *_new_points_estimates; + ICLKeyPointArray *_new_points; + std::unique_ptr _old_points_internal; + std::unique_ptr _new_points_internal; + std::unique_ptr _coefficient_table; + std::unique_ptr _old_values; + size_t _num_levels; }; } #endif /*ARM_COMPUTE_CLOPTICALFLOW_H */ diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h index ffde9ec186..ab32bccc24 100644 --- a/arm_compute/runtime/CL/functions/CLPReluLayer.h +++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CLPRELULAYER_H #define ARM_COMPUTE_CLPRELULAYER_H -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/IFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; namespace experimental { 
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h index e3a923f81c..2bbde30fc2 100644 --- a/arm_compute/runtime/CL/functions/CLPadLayer.h +++ b/arm_compute/runtime/CL/functions/CLPadLayer.h @@ -24,13 +24,15 @@ #ifndef ARM_COMPUTE_CLPADLAYER_H #define ARM_COMPUTE_CLPADLAYER_H -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" namespace arm_compute { +class CLCompileContext; +class CLPadLayerKernel; +class CLCopyKernel; class ICLTensor; /** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels: @@ -51,6 +53,8 @@ public: CLPadLayer &operator=(const CLPadLayer &) = delete; /** Default move assignment operator */ CLPadLayer &operator=(CLPadLayer &&) = default; + /** Default destructor */ + ~CLPadLayer(); /** Initialize the function * @@ -95,9 +99,9 @@ public: private: void configure_reflect_mode(ICLTensor *input, ICLTensor *output); - CLPadLayerKernel _pad_kernel; - CLCopyKernel _copy_kernel; - bool _perform_pad; + std::unique_ptr<CLPadLayerKernel> _pad_kernel; + std::unique_ptr<CLCopyKernel> _copy_kernel; + bool _perform_pad; }; } // namespace arm_compute #endif /*ARM_COMPUTE_PADLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h index abc23eff0c..50e81da7c4 100644 --- a/arm_compute/runtime/CL/functions/CLPermute.h +++ b/arm_compute/runtime/CL/functions/CLPermute.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to execute an @ref CLPermuteKernel. 
*/ class CLPermute : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h index 2731a08a52..34b8e72175 100644 --- a/arm_compute/runtime/CL/functions/CLPhase.h +++ b/arm_compute/runtime/CL/functions/CLPhase.h @@ -29,6 +29,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute an @ref CLMagnitudePhaseKernel. */ diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h index 2066012306..6432cd040d 100644 --- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -24,14 +24,16 @@ #ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H #define ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/IFunction.h" namespace arm_compute { // Forward declaration +class CLCompileContext; +class CLFillBorderKernel; class ICLTensor; +class ITensorInfo; namespace experimental { @@ -106,7 +108,7 @@ public: void run(ITensorPack &tensors) override; private: - CLFillBorderKernel _border_handler; + std::unique_ptr<CLFillBorderKernel> _border_handler; }; /** Basic function to run @ref CLComplexPixelWiseMultiplicationKernel. 
*/ @@ -139,7 +141,7 @@ public: void run(ITensorPack &tensors) override; private: - CLFillBorderKernel _border_handler; + std::unique_ptr<CLFillBorderKernel> _border_handler; }; } // namespace experimental diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h index 96dacf9322..ef1f426c22 100644 --- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels: * diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h index 9a78e77307..9129bfd064 100644 --- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h +++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h @@ -24,13 +24,16 @@ #ifndef ARM_COMPUTE_CLPRIORBOXLAYER_H #define ARM_COMPUTE_CLPRIORBOXLAYER_H -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; +class CLPriorBoxLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLPriorBoxLayerKernel. 
*/ class CLPriorBoxLayer : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index 6e537680ee..a8f9221b3d 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -24,9 +24,6 @@ #ifndef ARM_COMPUTE_CLQLSTMLAYER_H #define ARM_COMPUTE_CLQLSTMLAYER_H -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" @@ -40,7 +37,12 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; +class CLCopyKernel; class ICLTensor; +class CLGEMMLowpMatrixAReductionKernel; +class CLQLSTMLayerNormalizationKernel; +class ITensorInfo; /** Basic function to run @ref CLQLSTMLayer * @@ -68,6 +70,8 @@ public: CLQLSTMLayer &operator=(const CLQLSTMLayer &) = delete; /** Default move assignment operator */ CLQLSTMLayer &operator=(CLQLSTMLayer &&) = default; + /** Default destructor */ + ~CLQLSTMLayer(); /** Initialize function's tensors. * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. 
@@ -285,72 +289,72 @@ private: }; // Functions used - CLTranspose _transpose_input_to_forget_weights{}; - CLTranspose _transpose_input_to_cell_weights{}; - CLTranspose _transpose_input_to_output_weights{}; - CLTranspose _transpose_input_to_input_weights{}; - CLTranspose _transpose_recurrent_to_forget_weights{}; - CLTranspose _transpose_recurrent_to_cell_weights{}; - CLTranspose _transpose_recurrent_to_output_weights{}; - CLTranspose _transpose_recurrent_to_input_weights{}; - CLTranspose _transpose_projection_weights{}; - CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _projection_reduction{}; - CLArithmeticAddition _projection_bias_add{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; - CLGEMMLowpOutputStage _input_to_forget_outstage{}; - CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; - CLGEMMLowpOutputStage _cell_to_forget_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_forget{}; - CLArithmeticAddition _accumulate_cell_forget{}; - CLActivationLayer _forget_gate_sigmoid{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; - CLGEMMLowpOutputStage _input_to_cell_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; - CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_modulation{}; - CLActivationLayer _cell_gate_tanh{}; - CLArithmeticSubtraction 
_input_gate_sub{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; - CLGEMMLowpOutputStage _input_to_input_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; - CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_input{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; - CLGEMMLowpOutputStage _cell_to_input_outstage{}; - CLArithmeticAddition _accumulate_cell_input{}; - CLActivationLayer _input_gate_sigmoid{}; - CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; - CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; - CLArithmeticAddition _add_forget_cell{}; - CLActivationLayer _cell_clip{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; - CLGEMMLowpOutputStage _input_to_output_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; - CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_output{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; - CLGEMMLowpOutputStage _cell_to_output_outstage{}; - CLArithmeticAddition _accumulate_cell_to_output{}; - CLActivationLayer _output_gate_sigmoid{}; - CLActivationLayer _hidden_tanh{}; - CLPixelWiseMultiplication _pixelwise_mul_hidden{}; - CLGEMMLowpOutputStage _hidden_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_projection{}; - CLGEMMLowpOutputStage _projection_outstage{}; - CLArithmeticAddition _accumulate_projection{}; - CLActivationLayer _projection_clip{}; - std::array _layer_norms{ {} }; - CLCopyKernel _copy_output{}; + CLTranspose _transpose_input_to_forget_weights{}; + CLTranspose _transpose_input_to_cell_weights{}; + CLTranspose _transpose_input_to_output_weights{}; + CLTranspose _transpose_input_to_input_weights{}; + CLTranspose _transpose_recurrent_to_forget_weights{}; + CLTranspose _transpose_recurrent_to_cell_weights{}; + CLTranspose _transpose_recurrent_to_output_weights{}; + CLTranspose _transpose_recurrent_to_input_weights{}; + 
CLTranspose _transpose_projection_weights{}; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction; + std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction; + CLArithmeticAddition _projection_bias_add{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; + CLGEMMLowpOutputStage _input_to_forget_outstage{}; + CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; + CLGEMMLowpOutputStage _cell_to_forget_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_forget{}; + CLArithmeticAddition _accumulate_cell_forget{}; + CLActivationLayer _forget_gate_sigmoid{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; + CLGEMMLowpOutputStage _input_to_cell_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; + CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_modulation{}; + CLActivationLayer _cell_gate_tanh{}; + CLArithmeticSubtraction _input_gate_sub{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; + CLGEMMLowpOutputStage _input_to_input_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; + CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_input{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; + CLGEMMLowpOutputStage _cell_to_input_outstage{}; + CLArithmeticAddition _accumulate_cell_input{}; + CLActivationLayer _input_gate_sigmoid{}; + CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; + CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; + CLArithmeticAddition 
_add_forget_cell{}; + CLActivationLayer _cell_clip{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; + CLGEMMLowpOutputStage _input_to_output_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; + CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_output{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; + CLGEMMLowpOutputStage _cell_to_output_outstage{}; + CLArithmeticAddition _accumulate_cell_to_output{}; + CLActivationLayer _output_gate_sigmoid{}; + CLActivationLayer _hidden_tanh{}; + CLPixelWiseMultiplication _pixelwise_mul_hidden{}; + CLGEMMLowpOutputStage _hidden_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_projection{}; + CLGEMMLowpOutputStage _projection_outstage{}; + CLArithmeticAddition _accumulate_projection{}; + CLActivationLayer _projection_clip{}; + std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms; + std::unique_ptr<CLCopyKernel> _copy_output; TensorCopyKernel _projection_bias_copy{}; TensorCopyKernel _projection_output_to_accumulate_copy{}; @@ -402,30 +406,11 @@ private: inline CLQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g) { - return _layer_norms[getGateIndex(g)]; + return *_layer_norms[getGateIndex(g)]; } - inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in) - { - ARM_COMPUTE_ERROR_ON(!_has_layer_norm); - - CLTensor *out = &get_layer_norm_output(g); - _memory_group.manage(out); - out->allocator()->init(*(in->info())); - - get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); - } - - inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) - { - // Output quantization scale will be different, but ignored here - // since it will be configured at configure() stage. 
- const TensorInfo out - { - in - }; - return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); - } + inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in); + inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors CLTensor _input_to_forget_weights_transposed{ nullptr }; diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h index e045adf5fd..a0a27c5cb4 100644 --- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLQUANTIZATIONLAYER_H #define ARM_COMPUTE_CLQUANTIZATIONLAYER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to simulate a quantization layer. 
This function calls the following CL kernels: * diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h index 9d1cb1a724..ff3fb5449b 100644 --- a/arm_compute/runtime/CL/functions/CLRNNLayer.h +++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h @@ -24,15 +24,17 @@ #ifndef ARM_COMPUTE_CLRNN_LAYER_H #define ARM_COMPUTE_CLRNN_LAYER_H -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include + namespace arm_compute { +class CLCopyKernel; class ICLTensor; /** Basic function to run @ref CLRNNLayer */ @@ -41,6 +43,12 @@ class CLRNNLayer : public IFunction public: /** Default constructor */ CLRNNLayer(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLRNNLayer(const CLRNNLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLRNNLayer &operator=(const CLRNNLayer &) = delete; + /** Default destructor */ + ~CLRNNLayer(); /** Initialize the function * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data types supported: F16/F32 @@ -85,16 +93,16 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - CLGEMM _gemm_state_f; - CLArithmeticAddition _add_kernel; - CLActivationLayer _activation; - CLFullyConnectedLayer _fully_connected_kernel; - CLCopyKernel _copy_kernel; - CLTensor _fully_connected_out; - CLTensor _gemm_output; - CLTensor _add_output; - bool _is_prepared; + MemoryGroup _memory_group; + CLGEMM _gemm_state_f; + CLArithmeticAddition _add_kernel; + CLActivationLayer _activation; + CLFullyConnectedLayer _fully_connected_kernel; + std::unique_ptr<CLCopyKernel> _copy_kernel; + CLTensor _fully_connected_out; + CLTensor _gemm_output; + CLTensor _add_output; + bool _is_prepared; }; } #endif /* ARM_COMPUTE_CLRNN_LAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h index 2e78f16d6b..b4cd5560ef 100644 --- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h @@ -25,12 +25,14 @@ #define ARM_COMPUTE_CLROIALIGNLAYER_H #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ROIPoolingLayerInfo; +class ITensorInfo; /** Basic function to run @ref CLROIAlignLayerKernel. 
* diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h index 30139274be..836575ef68 100644 --- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h @@ -24,14 +24,14 @@ #ifndef ARM_COMPUTE_CLROIPOOLINGLAYER_H #define ARM_COMPUTE_CLROIPOOLINGLAYER_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ROIPoolingLayerInfo; /** Basic function to run @ref CLROIPoolingLayerKernel. * diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h index a86cfb605d..e11e740861 100644 --- a/arm_compute/runtime/CL/functions/CLRange.h +++ b/arm_compute/runtime/CL/functions/CLRange.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLRangeKernel * diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h index 5d050d71d6..3fbcee6c21 100644 --- a/arm_compute/runtime/CL/functions/CLReductionOperation.h +++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLREDUCTIONOPERATION_H #define ARM_COMPUTE_CLREDUCTIONOPERATION_H -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/IFunction.h" @@ -37,6 +35,9 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; +class CLFillBorderKernel; +class CLReductionOperationKernel; class ICLTensor; /** Perform reduction 
operation. @@ -49,6 +50,16 @@ public: * @param[in] memory_manager (Optional) Memory manager. */ CLReductionOperation(std::shared_ptr memory_manager = nullptr); + /** Default Destructor */ + ~CLReductionOperation(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLReductionOperation(const CLReductionOperation &) = delete; + /** Default move constructor */ + CLReductionOperation(CLReductionOperation &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLReductionOperation &operator=(const CLReductionOperation &) = delete; + /** Default move assignment operator */ + CLReductionOperation &operator=(CLReductionOperation &&) = default; /** Set the input and output tensors. * @@ -88,15 +99,15 @@ public: private: ICLTensor *configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output); - MemoryGroup _memory_group; - std::vector _results_vector; - std::vector _reduction_kernels_vector; - std::vector _border_handlers_vector; - CLReshapeLayer _reshape; - unsigned int _num_of_stages; - unsigned int _reduction_axis; - bool _is_serial; - bool _is_reshape_required; + MemoryGroup _memory_group; + std::vector _results_vector; + std::vector> _reduction_kernels_vector; + std::vector> _border_handlers_vector; + CLReshapeLayer _reshape; + unsigned int _num_of_stages; + unsigned int _reduction_axis; + bool _is_serial; + bool _is_reshape_required; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */ \ No newline at end of file diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h index 5b110d58f4..bf5d348b3b 100644 --- a/arm_compute/runtime/CL/functions/CLRemap.h +++ b/arm_compute/runtime/CL/functions/CLRemap.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute remap. 
This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLReorgLayer.h b/arm_compute/runtime/CL/functions/CLReorgLayer.h index a7287ce266..0840fd13fd 100644 --- a/arm_compute/runtime/CL/functions/CLReorgLayer.h +++ b/arm_compute/runtime/CL/functions/CLReorgLayer.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; class CLReorgLayer : public ICLSimpleFunction { diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h index 7fc6c3b864..b4d52ec8cf 100644 --- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h +++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h @@ -29,7 +29,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLReshapeLayerKernel */ class CLReshapeLayer : public IFunction diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h index 6b140920e9..81fa04b1f5 100644 --- a/arm_compute/runtime/CL/functions/CLReverse.h +++ b/arm_compute/runtime/CL/functions/CLReverse.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLREVERSE_H #define ARM_COMPUTE_CLREVERSE_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLReverseKernel */ class CLReverse : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h index d776e83035..360d63ea22 100644 --- a/arm_compute/runtime/CL/functions/CLScale.h +++ b/arm_compute/runtime/CL/functions/CLScale.h @@ -32,7 +32,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLScaleKernel */ class CLScale : public ICLSimpleFunction diff --git 
a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h index 3892874f35..19c860f39b 100644 --- a/arm_compute/runtime/CL/functions/CLScharr3x3.h +++ b/arm_compute/runtime/CL/functions/CLScharr3x3.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute scharr 3x3 filter. This function calls the following OpenCL kernels: diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h index a1af922303..7fd52312fb 100644 --- a/arm_compute/runtime/CL/functions/CLSelect.h +++ b/arm_compute/runtime/CL/functions/CLSelect.h @@ -24,14 +24,15 @@ #ifndef ARM_COMPUTE_CLSELECT_H #define ARM_COMPUTE_CLSELECT_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLSelect */ class CLSelect : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h index 23c398cb41..f17e77236d 100644 --- a/arm_compute/runtime/CL/functions/CLSlice.h +++ b/arm_compute/runtime/CL/functions/CLSlice.h @@ -31,6 +31,8 @@ namespace arm_compute { // Forward Declarations class ICLTensor; +class CLCompileContext; +class ITensorInfo; namespace experimental { diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h index 25d4ed6895..492900da11 100644 --- a/arm_compute/runtime/CL/functions/CLSobel3x3.h +++ b/arm_compute/runtime/CL/functions/CLSobel3x3.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute sobel 3x3 filter. 
This function calls the following OpenCL kernels: @@ -42,6 +43,14 @@ class ICLTensor; class CLSobel3x3 : public ICLSimpleFunction { public: + /** Default Constructor */ + CLSobel3x3() = default; + /** Prevent instances of this class from being copied */ + CLSobel3x3(const CLSobel3x3 &) = delete; + /** Prevent instances of this class from being copied */ + CLSobel3x3 &operator=(const CLSobel3x3 &) = delete; + /** Default destructor */ + ~CLSobel3x3(); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h index 1f91c46f7f..a00fdd72b8 100644 --- a/arm_compute/runtime/CL/functions/CLSobel5x5.h +++ b/arm_compute/runtime/CL/functions/CLSobel5x5.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLSOBEL5X5_H #define ARM_COMPUTE_CLSOBEL5X5_H -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" @@ -37,6 +35,10 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLSobel5x5HorKernel; +class CLSobel5x5VertKernel; class ICLTensor; /** Basic function to execute sobel 5x5 filter. This function calls the following OpenCL kernels: @@ -54,6 +56,12 @@ public: * @param[in] memory_manager (Optional) Memory manager. */ CLSobel5x5(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLSobel5x5(const CLSobel5x5 &) = delete; + /** Prevent instances of this class from being copied */ + CLSobel5x5 &operator=(const CLSobel5x5 &) = delete; + /** Default destructor */ + ~CLSobel5x5(); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. 
@@ -82,12 +90,12 @@ public: void run() override; protected: - MemoryGroup _memory_group; /**< Function's memory group */ - CLSobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */ - CLSobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */ - CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */ - CLImage _tmp_x; /**< Temporary buffer for Sobel X */ - CLImage _tmp_y; /**< Temporary buffer for Sobel Y */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr<CLSobel5x5HorKernel> _sobel_hor; /**< Sobel Horizontal 5x5 kernel */ + std::unique_ptr<CLSobel5x5VertKernel> _sobel_vert; /**< Sobel Vertical 5x5 kernel */ + std::unique_ptr<CLFillBorderKernel> _border_handler; /**< Kernel to handle image borders */ + CLImage _tmp_x; /**< Temporary buffer for Sobel X */ + CLImage _tmp_y; /**< Temporary buffer for Sobel Y */ }; } #endif /*ARM_COMPUTE_CLSOBEL5X5_H */ diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h index 91daf64c29..01a863b11b 100644 --- a/arm_compute/runtime/CL/functions/CLSobel7x7.h +++ b/arm_compute/runtime/CL/functions/CLSobel7x7.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLSOBEL7X7_H #define ARM_COMPUTE_CLSOBEL7X7_H -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" @@ -37,6 +35,10 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLSobel7x7HorKernel; +class CLSobel7x7VertKernel; class ICLTensor; /** Basic function to execute sobel 7x7 filter. This function calls the following OpenCL kernels: @@ -54,6 +56,12 @@ public: * @param[in] memory_manager (Optional) Memory manager. 
*/ CLSobel7x7(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLSobel7x7(const CLSobel7x7 &) = delete; + /** Prevent instances of this class from being copied */ + CLSobel7x7 &operator=(const CLSobel7x7 &) = delete; + /** Default destructor */ + ~CLSobel7x7(); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. @@ -82,12 +90,12 @@ public: void run() override; protected: - MemoryGroup _memory_group; /**< Function's memory group */ - CLSobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */ - CLSobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */ - CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */ - CLImage _tmp_x; /**< Temporary buffer for Sobel X */ - CLImage _tmp_y; /**< Temporary buffer for Sobel Y */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr _sobel_hor; /**< Sobel Horizontal 7x7 kernel */ + std::unique_ptr _sobel_vert; /**< Sobel Vertical 7x7 kernel */ + std::unique_ptr _border_handler; /**< Kernel to handle image borders */ + CLImage _tmp_x; /**< Temporary buffer for Sobel X */ + CLImage _tmp_y; /**< Temporary buffer for Sobel Y */ }; } #endif /*ARM_COMPUTE_CLSOBEL7X7_H */ diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h index fd71f3ed4d..ab10a64de4 100644 --- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H #define ARM_COMPUTE_CLSOFTMAXLAYER_H -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLPermute.h" #include "arm_compute/runtime/IFunction.h" @@ -35,7 +34,11 @@ namespace arm_compute { +class CLCompileContext; +class CLLogits1DMaxShiftExpSumKernel; +class 
CLLogits1DNormKernel; class ICLTensor; +class ITensorInfo; /** Basic function to compute a SoftmaxLayer. * @@ -57,6 +60,16 @@ class CLSoftmaxLayerGeneric : public IFunction public: /** Constructor */ CLSoftmaxLayerGeneric(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLSoftmaxLayerGeneric(const CLSoftmaxLayerGeneric &) = delete; + /** Prevent instances of this class from being copied */ + CLSoftmaxLayerGeneric &operator=(const CLSoftmaxLayerGeneric &) = delete; + /** Prevent instances of this class to be moved */ + CLSoftmaxLayerGeneric(CLSoftmaxLayerGeneric &&) = delete; + /** Prevent instances of this class to be moved */ + CLSoftmaxLayerGeneric &operator=(CLSoftmaxLayerGeneric &&) = delete; + /** Default destructor */ + ~CLSoftmaxLayerGeneric(); /** Set the input and output tensors. * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax @@ -92,17 +105,17 @@ public: void run() override; private: - MemoryGroup _memory_group; - CLPermute _permute_input; - CLPermute _permute_output; - CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel; - CLLogits1DNormKernel _norm_kernel; - CLTensor _max; - CLTensor _sum; - CLTensor _tmp; - CLTensor _input_permuted; - CLTensor _output_permuted; - bool _needs_permute; + MemoryGroup _memory_group; + CLPermute _permute_input; + CLPermute _permute_output; + std::unique_ptr _max_shift_exp_sum_kernel; + std::unique_ptr _norm_kernel; + CLTensor _max; + CLTensor _sum; + CLTensor _tmp; + CLTensor _input_permuted; + CLTensor _output_permuted; + bool _needs_permute; }; using CLSoftmaxLayer = CLSoftmaxLayerGeneric; diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h index c6f7f11079..1611aa8ed4 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h @@ -24,16 +24,19 
@@ #ifndef ARM_COMPUTE_CLSPACETOBATCHLAYER_H #define ARM_COMPUTE_CLSPACETOBATCHLAYER_H -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" -#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +#include namespace arm_compute { +class CLCompileContext; +class CLMemsetKernel; +class CLSpaceToBatchLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to spatial divide a tensor. This function calls the following OpenCL kernels/functions: * @@ -54,7 +57,7 @@ public: /** Allow instances of this class to be moved */ CLSpaceToBatchLayer &operator=(CLSpaceToBatchLayer &&) = default; /** Default destructor */ - virtual ~CLSpaceToBatchLayer() = default; + ~CLSpaceToBatchLayer(); /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. 
@@ -121,9 +124,9 @@ public: void run() override; private: - CLSpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ - CLMemsetKernel _memset_kernel; /**< Memset kernel to run */ - bool _has_padding; /**< Flag to check if the output has padding */ + std::unique_ptr<CLSpaceToBatchLayerKernel> _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ + std::unique_ptr<CLMemsetKernel> _memset_kernel; /**< Memset kernel to run */ + bool _has_padding; /**< Flag to check if the output has padding */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLSPACETOBATCHLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h index 24830cf4d3..9e476fe7bd 100644 --- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h +++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h @@ -24,14 +24,17 @@ #ifndef ARM_COMPUTE_CLSPACETODEPTHLAYER_H #define ARM_COMPUTE_CLSPACETODEPTHLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h" -#include "arm_compute/core/Types.h" +#include <memory> namespace arm_compute { +class CLCompileContext; +class CLSpaceToDepthLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLSpaceToDepthLayerKernel. 
*/ class CLSpaceToDepthLayer : public IFunction @@ -39,6 +42,16 @@ class CLSpaceToDepthLayer : public IFunction public: /** Default constructor */ CLSpaceToDepthLayer(); + /** Prevent instances of this class from being copied */ + CLSpaceToDepthLayer(const CLSpaceToDepthLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLSpaceToDepthLayer &operator=(const CLSpaceToDepthLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLSpaceToDepthLayer(CLSpaceToDepthLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLSpaceToDepthLayer &operator=(CLSpaceToDepthLayer &&) = delete; + /** Default destructor */ + ~CLSpaceToDepthLayer(); /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -68,7 +81,7 @@ public: void run() override; private: - CLSpaceToDepthLayerKernel _space_to_depth_kernel; /**< CLSpaceToDepthLayerKernel to run */ + std::unique_ptr<CLSpaceToDepthLayerKernel> _space_to_depth_kernel; /**< CLSpaceToDepthLayerKernel to run */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLSPACETODEPTHLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h index 95875962c8..3861fd299a 100644 --- a/arm_compute/runtime/CL/functions/CLStackLayer.h +++ b/arm_compute/runtime/CL/functions/CLStackLayer.h @@ -27,14 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h" - #include <memory> #include <vector> namespace arm_compute { +class CLCompileContext; +class CLStackLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to stack tensors along an axis. 
This function calls the following kernel: * @@ -46,6 +47,16 @@ class CLStackLayer : public IFunction public: /** Default constructor */ CLStackLayer(); + /** Prevent instances of this class from being copied */ + CLStackLayer(const CLStackLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLStackLayer &operator=(const CLStackLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLStackLayer(CLStackLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLStackLayer &operator=(CLStackLayer &&) = delete; + /** Default destructor */ + ~CLStackLayer(); /** Initialise the kernel's inputs vector and output. * * @note Supported input tensor rank: up to 4 @@ -84,9 +95,9 @@ public: void run() override; private: - std::vector<ICLTensor *> _input; - std::vector<CLStackLayerKernel> _stack_kernels; - unsigned int _num_inputs; + std::vector<ICLTensor *> _input; + std::vector<std::unique_ptr<CLStackLayerKernel>> _stack_kernels; + unsigned int _num_inputs; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLSTACKLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h index 32d4b7bdf9..ca59309548 100644 --- a/arm_compute/runtime/CL/functions/CLTableLookup.h +++ b/arm_compute/runtime/CL/functions/CLTableLookup.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; class ICLLut; diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h index f3af122f0a..2c9213bd01 100644 --- a/arm_compute/runtime/CL/functions/CLThreshold.h +++ b/arm_compute/runtime/CL/functions/CLThreshold.h @@ -33,6 +33,7 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLThresholdKernel */ diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h index d2f1e9730c..69743693ff 100644 --- a/arm_compute/runtime/CL/functions/CLTile.h +++ 
b/arm_compute/runtime/CL/functions/CLTile.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CLTILE_H #define ARM_COMPUTE_CLTILE_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLTileKernel */ class CLTile : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h index 9ba7cafce4..2b7a03f23f 100644 --- a/arm_compute/runtime/CL/functions/CLTranspose.h +++ b/arm_compute/runtime/CL/functions/CLTranspose.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLTRANSPOSE_H #define ARM_COMPUTE_CLTRANSPOSE_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to transpose a matrix on OpenCL. 
This function calls the following OpenCL kernel: * diff --git a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h index 07b4c8aecb..88b293069d 100644 --- a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h +++ b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h @@ -26,13 +26,17 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include <memory> + namespace arm_compute { +class CLCompileContext; +class CLUpsampleLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLUpsampleLayerKernel */ class CLUpsampleLayer : public IFunction @@ -49,7 +53,7 @@ public: /** Allow instances of this class to be moved */ CLUpsampleLayer &operator=(CLUpsampleLayer &&) = default; /** Default destructor */ - virtual ~CLUpsampleLayer() = default; + ~CLUpsampleLayer(); /** Initialize the function's source, destination, interpolation type and border_mode. 
* @@ -86,8 +90,8 @@ public: void run() override; private: - CLUpsampleLayerKernel _upsample; - ICLTensor *_output; + std::unique_ptr<CLUpsampleLayerKernel> _upsample; + ICLTensor *_output; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLUPSAMPLELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h index eb7c05be84..153e9bfdfc 100644 --- a/arm_compute/runtime/CL/functions/CLWarpAffine.h +++ b/arm_compute/runtime/CL/functions/CLWarpAffine.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation */ diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h index 2a1f78093e..5c8b5425a4 100644 --- a/arm_compute/runtime/CL/functions/CLWarpPerspective.h +++ b/arm_compute/runtime/CL/functions/CLWarpPerspective.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation */ diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index 602f644230..9ced69c1bb 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H #define ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H -#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" @@ -33,7 +31,11 @@ namespace arm_compute { +class CLCompileContext; +class CLWinogradFilterTransformKernel; +class CLWinogradOutputTransformKernel; class 
ICLTensor; +class ITensorInfo; /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: * @@ -56,6 +58,8 @@ public: CLWinogradConvolutionLayer &operator=(const CLWinogradConvolutionLayer &) = delete; /** Default move assignment operator */ CLWinogradConvolutionLayer &operator=(CLWinogradConvolutionLayer &&) = default; + /** Default destructor */ + ~CLWinogradConvolutionLayer(); /** Set the input and output tensors. * * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout @@ -122,16 +126,16 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - CLGEMM _batched_mm; - CLWinogradInputTransform _input_transform; - CLWinogradFilterTransformKernel _filter_transform; - CLWinogradOutputTransformKernel _output_transform; - CLTensor _input0; - CLTensor _input1; - CLTensor _batched_mm_output; - const ICLTensor *_original_weights; - bool _is_prepared; + MemoryGroup _memory_group; + CLGEMM _batched_mm; + CLWinogradInputTransform _input_transform; + std::unique_ptr<CLWinogradFilterTransformKernel> _filter_transform; + std::unique_ptr<CLWinogradOutputTransformKernel> _output_transform; + CLTensor _input0; + CLTensor _input1; + CLTensor _batched_mm_output; + const ICLTensor *_original_weights; + bool _is_prepared; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h index 351f88012f..8cd809cc1f 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h +++ b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to execute a @ref CLWinogradInputTransformKernel. 
*/ class CLWinogradInputTransform : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLYOLOLayer.h b/arm_compute/runtime/CL/functions/CLYOLOLayer.h index 3e403f44bd..48ee4ea4f7 100644 --- a/arm_compute/runtime/CL/functions/CLYOLOLayer.h +++ b/arm_compute/runtime/CL/functions/CLYOLOLayer.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CLYOLOLAYER_H #define ARM_COMPUTE_CLYOLOLAYER_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLYOLOLayerKernel that performs a partial activation on the input * -- cgit v1.2.1