From bef7fa27b0d231a8649952f60808132d109b6345 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park
Date: Wed, 21 Oct 2020 15:58:54 +0100
Subject: COMPMID-3639: (3RDPARTY_UPDATE) Move CL kernels to src

Change-Id: I10d27db788e5086adae1841e3e2441cd9b76ef84
Signed-off-by: Sang-Hoon Park
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4310
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/runtime/CL/functions/CLAbsoluteDifference.cpp  |   2 +-
 src/runtime/CL/functions/CLAccumulate.cpp          |   2 +-
 src/runtime/CL/functions/CLActivationLayer.cpp     |   2 +-
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp      |  22 +++-
 .../CL/functions/CLBatchNormalizationLayer.cpp     |  14 +-
 src/runtime/CL/functions/CLBatchToSpaceLayer.cpp   |  16 ++-
 src/runtime/CL/functions/CLBitwiseAnd.cpp          |   2 +-
 src/runtime/CL/functions/CLBitwiseNot.cpp          |   2 +-
 src/runtime/CL/functions/CLBitwiseOr.cpp           |   2 +-
 src/runtime/CL/functions/CLBitwiseXor.cpp          |   2 +-
 .../CL/functions/CLBoundingBoxTransform.cpp        |   2 +-
 src/runtime/CL/functions/CLBox3x3.cpp              |   5 +-
 src/runtime/CL/functions/CLCannyEdge.cpp           |  30 +++--
 src/runtime/CL/functions/CLCast.cpp                |   2 +-
 src/runtime/CL/functions/CLChannelCombine.cpp      |   2 +-
 src/runtime/CL/functions/CLChannelExtract.cpp      |   2 +-
 src/runtime/CL/functions/CLChannelShuffleLayer.cpp |   2 +-
 src/runtime/CL/functions/CLColorConvert.cpp        |   2 +-
 src/runtime/CL/functions/CLComparison.cpp          |   7 +-
 src/runtime/CL/functions/CLComputeAllAnchors.cpp   |   1 +
 src/runtime/CL/functions/CLConcatenateLayer.cpp    |  12 +-
 .../functions/CLConvertFullyConnectedWeights.cpp   |   2 +
 src/runtime/CL/functions/CLConvolution.cpp         |  32 +++--
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |   3 +-
 src/runtime/CL/functions/CLCopy.cpp                |   2 +-
 src/runtime/CL/functions/CLCropResize.cpp          |   6 +
 src/runtime/CL/functions/CLDeconvolutionLayer.cpp  |   1 -
 .../CL/functions/CLDeconvolutionLayerUpsample.cpp  |  17 ++-
 src/runtime/CL/functions/CLDepthConvertLayer.cpp   |   2 +-
 src/runtime/CL/functions/CLDepthToSpaceLayer.cpp   |   2 +-
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  52 ++++----
 src/runtime/CL/functions/CLDequantizationLayer.cpp |   2 +-
 src/runtime/CL/functions/CLDerivative.cpp          |   5 +-
 src/runtime/CL/functions/CLDilate.cpp              |   5 +-
 .../CL/functions/CLDirectConvolutionLayer.cpp      |  21 +--
 .../CL/functions/CLDirectDeconvolutionLayer.cpp    |   6 +
 .../CL/functions/CLElementWiseUnaryLayer.cpp       |   2 +-
 .../CL/functions/CLElementwiseOperations.cpp       |   2 +-
 src/runtime/CL/functions/CLEqualizeHistogram.cpp   |  24 +++-
 src/runtime/CL/functions/CLErode.cpp               |   5 +-
 src/runtime/CL/functions/CLFFT1D.cpp               |  30 +++--
 src/runtime/CL/functions/CLFFT2D.cpp               |   5 +
 src/runtime/CL/functions/CLFFTConvolutionLayer.cpp |   7 +
 src/runtime/CL/functions/CLFastCorners.cpp         |  20 +--
 src/runtime/CL/functions/CLFill.cpp                |   2 +-
 src/runtime/CL/functions/CLFillBorder.cpp          |   2 +-
 src/runtime/CL/functions/CLFlattenLayer.cpp        |   2 +-
 src/runtime/CL/functions/CLFloor.cpp               |   2 +-
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |  13 ++
 .../CL/functions/CLFuseBatchNormalization.cpp      |  10 +-
 src/runtime/CL/functions/CLGEMM.cpp                | 142 ++++++++++++++-------
 .../CL/functions/CLGEMMConvolutionLayer.cpp        |  47 +++++--
 .../CL/functions/CLGEMMDeconvolutionLayer.cpp      |  25 +++-
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  |  80 +++++++-----
 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp |   9 +-
 src/runtime/CL/functions/CLGather.cpp              |   2 +-
 src/runtime/CL/functions/CLGaussian3x3.cpp         |   5 +-
 src/runtime/CL/functions/CLGaussian5x5.cpp         |  24 ++--
 src/runtime/CL/functions/CLGaussianPyramid.cpp     |  47 ++++---
 .../CL/functions/CLGenerateProposalsLayer.cpp      |  57 +++++---
 src/runtime/CL/functions/CLHOGDescriptor.cpp       |  22 +++-
 src/runtime/CL/functions/CLHOGDetector.cpp         |  11 +-
 src/runtime/CL/functions/CLHOGGradient.cpp         |  15 ++-
 src/runtime/CL/functions/CLHOGMultiDetection.cpp   |  21 ++-
 src/runtime/CL/functions/CLHarrisCorners.cpp       |  26 ++--
 .../CL/functions/CLInstanceNormalizationLayer.cpp  |   4 +-
 src/runtime/CL/functions/CLIntegralImage.cpp       |  16 ++-
 src/runtime/CL/functions/CLL2NormalizeLayer.cpp    |  16 ++-
 src/runtime/CL/functions/CLLSTMLayer.cpp           |  57 ++++++---
 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp  |   8 ++
 src/runtime/CL/functions/CLLaplacianPyramid.cpp    |   3 +
 .../CL/functions/CLLaplacianReconstruct.cpp        |   2 +
 .../CL/functions/CLLocallyConnectedLayer.cpp       |  35 +++--
 src/runtime/CL/functions/CLMagnitude.cpp           |   2 +-
 src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp   |  17 ++-
 src/runtime/CL/functions/CLMeanStdDev.cpp          |  18 ++-
 .../functions/CLMeanStdDevNormalizationLayer.cpp   |   2 +-
 src/runtime/CL/functions/CLMedian3x3.cpp           |   5 +-
 src/runtime/CL/functions/CLMinMaxLocation.cpp      |  17 ++-
 src/runtime/CL/functions/CLNonLinearFilter.cpp     |   5 +-
 .../CL/functions/CLNonMaximaSuppression3x3.cpp     |   7 +-
 src/runtime/CL/functions/CLNormalizationLayer.cpp  |  17 ++-
 .../CL/functions/CLNormalizePlanarYUVLayer.cpp     |   2 +-
 src/runtime/CL/functions/CLOpticalFlow.cpp         |  38 +++---
 src/runtime/CL/functions/CLPReluLayer.cpp          |   2 +-
 src/runtime/CL/functions/CLPadLayer.cpp            |  17 ++-
 src/runtime/CL/functions/CLPermute.cpp             |   2 +-
 src/runtime/CL/functions/CLPhase.cpp               |   2 +-
 .../CL/functions/CLPixelWiseMultiplication.cpp     |  15 ++-
 src/runtime/CL/functions/CLPoolingLayer.cpp        |   5 +-
 src/runtime/CL/functions/CLPriorBoxLayer.cpp       |   4 +-
 src/runtime/CL/functions/CLQLSTMLayer.cpp          |  91 ++++++++++---
 src/runtime/CL/functions/CLQuantizationLayer.cpp   |   2 +-
 src/runtime/CL/functions/CLRNNLayer.cpp            |  24 +++-
 src/runtime/CL/functions/CLROIAlignLayer.cpp       |   3 +-
 src/runtime/CL/functions/CLROIPoolingLayer.cpp     |   4 +-
 src/runtime/CL/functions/CLRange.cpp               |   2 +-
 src/runtime/CL/functions/CLReduceMean.cpp          |   3 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |  42 ++++--
 src/runtime/CL/functions/CLRemap.cpp               |   5 +-
 src/runtime/CL/functions/CLReorgLayer.cpp          |   2 +-
 src/runtime/CL/functions/CLReshapeLayer.cpp        |   2 +-
 src/runtime/CL/functions/CLReverse.cpp             |   2 +-
 src/runtime/CL/functions/CLScale.cpp               |   5 +-
 src/runtime/CL/functions/CLScharr3x3.cpp           |   5 +-
 src/runtime/CL/functions/CLSelect.cpp              |   2 +-
 src/runtime/CL/functions/CLSlice.cpp               |   2 +-
 src/runtime/CL/functions/CLSobel3x3.cpp            |   7 +-
 src/runtime/CL/functions/CLSobel5x5.cpp            |  33 +++--
 src/runtime/CL/functions/CLSobel7x7.cpp            |  33 +++--
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |  30 +++--
 src/runtime/CL/functions/CLSpaceToBatchLayer.cpp   |  21 ++-
 src/runtime/CL/functions/CLSpaceToDepthLayer.cpp   |  10 +-
 src/runtime/CL/functions/CLStackLayer.cpp          |  11 +-
 src/runtime/CL/functions/CLStridedSlice.cpp        |   2 +-
 src/runtime/CL/functions/CLTableLookup.cpp         |   2 +-
 src/runtime/CL/functions/CLThreshold.cpp           |   2 +-
 src/runtime/CL/functions/CLTile.cpp                |   2 +-
 src/runtime/CL/functions/CLTranspose.cpp           |   2 +-
 src/runtime/CL/functions/CLUpsampleLayer.cpp       |  10 +-
 src/runtime/CL/functions/CLWarpAffine.cpp          |   5 +-
 src/runtime/CL/functions/CLWarpPerspective.cpp     |   5 +-
 .../CL/functions/CLWinogradConvolutionLayer.cpp    |  23 +++-
 .../CL/functions/CLWinogradInputTransform.cpp      |   5 +-
 src/runtime/CL/functions/CLYOLOLayer.cpp           |   2 +-
 125 files changed, 1106 insertions(+), 563 deletions(-)

(limited to 'src/runtime/CL/functions')
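Before the file-by-file hunks, note the shape of the refactoring. Every change below is an instance of the same mechanical pattern: the kernel headers leave the public include tree (arm_compute/core/CL/kernels/) for src/core/CL/kernels/, so a runtime function can no longer hold a kernel by value in its public header. It instead forward-declares the kernel, owns it through a std::unique_ptr, builds it with support::cpp14::make_unique, and defines its destructor out of line in the .cpp, where the kernel type is complete. A minimal sketch of the pattern, using the hypothetical names CLExampleFunction and CLExampleKernel in place of the real classes listed above:

    // CLExampleFunction.h -- the public header (hypothetical names throughout).
    // Only a forward declaration of the kernel is visible from here.
    #include "arm_compute/runtime/IFunction.h"

    #include <memory>

    namespace arm_compute
    {
    class CLExampleKernel;

    class CLExampleFunction : public IFunction
    {
    public:
        CLExampleFunction();
        ~CLExampleFunction(); // declared here, defined where CLExampleKernel is complete
        void run() override;

    private:
        std::unique_ptr<CLExampleKernel> _kernel; // was: CLExampleKernel _kernel;
    };
    } // namespace arm_compute

    // CLExampleFunction.cpp -- the implementation.
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/core/CL/kernels/CLExampleKernel.h" // was: arm_compute/core/CL/kernels/...
    #include "support/MemorySupport.h"

    namespace arm_compute
    {
    CLExampleFunction::CLExampleFunction()
        : _kernel(support::cpp14::make_unique<CLExampleKernel>())
    {
    }

    // A defaulted destructor must live here: std::unique_ptr<CLExampleKernel>
    // cannot be destroyed against the forward declaration alone.
    CLExampleFunction::~CLExampleFunction() = default;

    void CLExampleFunction::run()
    {
        CLScheduler::get().enqueue(*_kernel); // kernel members are now dereferenced
    }
    } // namespace arm_compute

This is why nearly every file below gains an out-of-line "~CLFoo() = default;", a make_unique call in the constructor initializer list, and a dereference in its CLScheduler::get().enqueue(*...) calls.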
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index d5d1bbdd7a..b7f40a516c 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
 
-#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 2f06252446..742de64e34 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLAccumulate.h"
 
-#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "src/core/CL/kernels/CLAccumulateKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 5ddf227382..61c82b33eb 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "src/core/CL/kernels/CLActivationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index 57c4f685f6..5fc849e3c5 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -30,8 +30,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/Utils.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -40,6 +42,8 @@ CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manage
 {
 }
 
+CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
+
 Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -124,13 +128,19 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
 
     // Configure reduction operation kernels
-    _reduction_kernels_vector.resize(_num_of_stages);
+    _reduction_kernels_vector.reserve(_num_of_stages);
+
+    auto add_reduction_kernel = [this, &compile_context, axis, op](const ICLTensor * input, const ICLTensor * prev_output, ICLTensor * output)
+    {
+        _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLArgMinMaxLayerKernel>());
+        _reduction_kernels_vector.back()->configure(compile_context, input, prev_output, output, axis, op);
+    };
 
     _memory_group.manage(&_not_reshaped_output);
 
     // Create temporary tensors
     if(_num_of_stages == 1)
     {
-        _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op);
+        add_reduction_kernel(input, nullptr, &_not_reshaped_output);
     }
     else
     {
@@ -144,19 +154,19 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const
 
         // Apply ReductionOperation only on first kernel
         _memory_group.manage(&_results_vector[0]);
-        _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op);
+        add_reduction_kernel(input, nullptr, &_results_vector[0]);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
             _memory_group.manage(&_results_vector[i]);
-            _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op);
+            add_reduction_kernel(input, &_results_vector[i - 1], &_results_vector[i]);
             _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage = _num_of_stages - 1;
-        _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
+        add_reduction_kernel(input, &_results_vector[last_stage - 1], &_not_reshaped_output);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
     _reshape.configure(compile_context, &_not_reshaped_output, output);
@@ -169,7 +179,7 @@ void CLArgMinMaxLayer::run()
 
     for(unsigned int i = 0; i < _num_of_stages; ++i)
     {
-        CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+        CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false);
     }
     _reshape.run();
 }
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 701add074e..77eed1140f 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -29,14 +29,19 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
+namespace arm_compute
+{
 CLBatchNormalizationLayer::CLBatchNormalizationLayer()
-    : _norm_kernel()
+    : _norm_kernel(support::cpp14::make_unique<CLBatchNormalizationLayerKernel>())
 {
 }
 
+CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
+
 void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
                                           ActivationLayerInfo act_info)
 {
@@ -47,7 +52,7 @@ void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_contex
                                           const ICLTensor *gamma, float epsilon,
                                           ActivationLayerInfo act_info)
 {
-    _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
+    _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
 Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
@@ -60,5 +65,6 @@ Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITens
 
 void CLBatchNormalizationLayer::run()
 {
-    CLScheduler::get().enqueue(_norm_kernel, true);
+    CLScheduler::get().enqueue(*_norm_kernel, true);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index 5ba3b5bc9c..e0a2c430ed 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -30,13 +30,18 @@
 #include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" +#include "support/MemorySupport.h" +namespace arm_compute +{ CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel() + : _batch_to_space_kernel(support::cpp14::make_unique()) { } +CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default; + void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); @@ -44,7 +49,7 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) @@ -54,7 +59,7 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_ void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output); } Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) @@ -69,5 +74,6 @@ Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_sha void CLBatchToSpaceLayer::run() { - CLScheduler::get().enqueue(_batch_to_space_kernel, true); + CLScheduler::get().enqueue(*_batch_to_space_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index cb49e61e84..cfcd63f170 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" +#include "src/core/CL/kernels/CLBitwiseAndKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 22c575ca8d..588c793f6a 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" +#include "src/core/CL/kernels/CLBitwiseNotKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index 4bbb8909fe..3a5de193a3 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" +#include "src/core/CL/kernels/CLBitwiseOrKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index bc37f6eaab..62aeaaa31f 100644 --- 
a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" +#include "src/core/CL/kernels/CLBitwiseXorKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 2384fc4132..600d36290c 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp index 0300899b59..be40f25055 100644 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ b/src/runtime/CL/functions/CLBox3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLBox3x3.h" -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLBox3x3Kernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *inp auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp index cd2d6b478a..5a32564d2d 100644 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ b/src/runtime/CL/functions/CLCannyEdge.cpp @@ -31,6 +31,10 @@ #include "arm_compute/runtime/CL/functions/CLSobel3x3.h" #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" +#include "src/core/CL/kernels/CLCannyEdgeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -38,10 +42,10 @@ using namespace arm_compute; CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _sobel(), - _gradient(), - _border_mag_gradient(), - _non_max_suppr(), - _edge_trace(), + _gradient(support::cpp14::make_unique()), + _border_mag_gradient(support::cpp14::make_unique()), + _non_max_suppr(support::cpp14::make_unique()), + _edge_trace(support::cpp14::make_unique()), _gx(), _gy(), _mag(), @@ -55,6 +59,8 @@ CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLI { } +CLCannyEdge::~CLCannyEdge() = default; + void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) { @@ -143,7 +149,7 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_phase); // Configure gradient - 
_gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); + _gradient->configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -153,14 +159,14 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_nonmax); // Configure non-maxima suppression - _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); + _non_max_suppr->configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); // Allocate intermediate buffers _phase.allocator()->allocate(); // Fill border around magnitude image as non-maxima suppression will access // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); + _border_mag_gradient->configure(compile_context, &_mag, _non_max_suppr->border_size(), border_mode, constant_border_value); // Allocate intermediate buffers _mag.allocator()->allocate(); @@ -172,7 +178,7 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_l1_list_counter); // Configure edge tracing - _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); + _edge_trace->configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); // Allocate intermediate buffers _visited.allocator()->allocate(); @@ -190,14 +196,14 @@ void CLCannyEdge::run() _sobel->run(); // Run phase and magnitude calculation - CLScheduler::get().enqueue(_gradient, false); + CLScheduler::get().enqueue(*_gradient, false); // Fill border before non-maxima suppression. Nop for border mode undefined. 
- CLScheduler::get().enqueue(_border_mag_gradient, false); + CLScheduler::get().enqueue(*_border_mag_gradient, false); // Run non max suppresion _nonmax.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_non_max_suppr, false); + CLScheduler::get().enqueue(*_non_max_suppr, false); // Clear temporary structures and run edge trace _output->clear(CLScheduler::get().queue()); @@ -205,5 +211,5 @@ void CLCannyEdge::run() _recorded.clear(CLScheduler::get().queue()); _l1_list_counter.clear(CLScheduler::get().queue()); _l1_stack.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_edge_trace, true); + CLScheduler::get().enqueue(*_edge_trace, true); } diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 95cc0e9239..2a28e06845 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLCast.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp index 326caa8c74..e93aea31f4 100644 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ b/src/runtime/CL/functions/CLChannelCombine.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelCombine.h" -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" +#include "src/core/CL/kernels/CLChannelCombineKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp index aa37af9988..8b4a3f7458 100644 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ b/src/runtime/CL/functions/CLChannelExtract.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelExtract.h" -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" +#include "src/core/CL/kernels/CLChannelExtractKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index b79afdb3b4..c443df3b37 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp index 2bbb30e24c..95f4257929 100644 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ b/src/runtime/CL/functions/CLColorConvert.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLColorConvert.h" -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" +#include "src/core/CL/kernels/CLColorConvertKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 8c18b35583..9b5840aa95 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLComparison.h" #include "arm_compute/core/CL/ICLTensor.h" -#include 
"arm_compute/core/CL/kernels/CLComparisonKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -47,7 +48,7 @@ void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -76,7 +77,7 @@ void CLComparisonStatic::configure(const CLCompileContext &compile_context, if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp index be86fc4f78..2cae0ee455 100644 --- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp +++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 2eb310b893..54f71f9765 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -23,19 +23,19 @@ */ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" -#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index b291ae5b88..8ecc114343 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -22,6 +22,8 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp index bc962d0052..1ad32d309c 100644 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ b/src/runtime/CL/functions/CLConvolution.cpp @@ -24,7 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLConvolution.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" @@ -32,6 +31,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLConvolutionKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -49,15 +50,20 @@ void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTen auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } template CLConvolutionSquare::CLConvolutionSquare(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() + : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(support::cpp14::make_unique>()), + _kernel_vert(support::cpp14::make_unique>()), _kernel(support::cpp14::make_unique>()), + _border_handler(support::cpp14::make_unique()) { } +template +CLConvolutionSquare::~CLConvolutionSquare() = default; + template void CLConvolutionSquare::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) @@ -88,35 +94,35 @@ void CLConvolutionSquare::configure(const CLCompileContext &compile scale = calculate_matrix_scale(conv, matrix_size); } - _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor->configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); + _kernel_vert->configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); + _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffer _tmp.allocator()->allocate(); } else { - _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); + 
_kernel->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } } template void CLConvolutionSquare::run() { - CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(*_border_handler); if(_is_separable) { MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); + CLScheduler::get().enqueue(*_kernel_hor, false); + CLScheduler::get().enqueue(*_kernel_vert); } else { - CLScheduler::get().enqueue(_kernel); + CLScheduler::get().enqueue(*_kernel); } } @@ -135,5 +141,5 @@ void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 85355f0f17..e214bdf0f2 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -29,7 +29,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" - #include "support/MemorySupport.h" #include @@ -45,6 +44,8 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma { } +CLConvolutionLayer::~CLConvolutionLayer() = default; + void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index acdc52d4f7..f7b016a779 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -24,11 +24,11 @@ #include "arm_compute/runtime/CL/functions/CLCopy.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLCopyKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 4cf9f13a67..4aaa674c5c 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,6 +25,10 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLCropKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -61,6 +65,8 @@ CLCropResize::CLCropResize() { } +CLCropResize::~CLCropResize() = default; + Status 
CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) { diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index e6717b6d01..6fe231ea6c 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -28,7 +28,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" - #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index eb1fb7fbdf..0cf2ea623f 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,16 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(), - _memset(), + : _upsample(support::cpp14::make_unique()), + _memset(support::cpp14::make_unique()), _output(nullptr) { } +CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; + Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); @@ -52,13 +57,13 @@ void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(compile_context, input, _output, info); + _memset->configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample->configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() { - CLScheduler::get().enqueue(_memset, false); - CLScheduler::get().enqueue(_upsample, true); + CLScheduler::get().enqueue(*_memset, false); + CLScheduler::get().enqueue(*_upsample, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index 141eb3fefc..e58c0e5f4c 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 8571056104..8dbd974ceb 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" 
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index bb0db2e7a7..2440384e3b 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -24,13 +24,19 @@ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -119,7 +125,7 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _dwc_native_kernel(), + _dwc_native_kernel(support::cpp14::make_unique()), _permute_input_to_nhwc(), _permute_weights_to_nhwc(), _permute_output_to_nchw(), @@ -137,6 +143,8 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv { } +CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default; + void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { @@ -206,9 +214,9 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( dwc_weights_info.n0 = (depth_multiplier == 1) ? 
8 : 1; DWCKernelInfo dwc_info; dwc_info.activation_info = act_info; - _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, - output_multipliers_to_use, output_shifts_to_use); + _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + output_multipliers_to_use, output_shifts_to_use); if(_needs_permute) { @@ -302,7 +310,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run() { _permute_input_to_nhwc.run(); } - CLScheduler::get().enqueue(_dwc_native_kernel); + CLScheduler::get().enqueue(*_dwc_native_kernel); if(_needs_permute) { _permute_output_to_nchw.run(); @@ -343,11 +351,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _kernel(nullptr), - _border_handler(), + _border_handler(support::cpp14::make_unique()), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), - _reshape_weights(), + _reshape_weights(support::cpp14::make_unique()), _permuted_input(), _permuted_weights(), _permuted_output(), @@ -378,14 +386,14 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - gpu_target, - dilation)); + weights->info(), + biases != nullptr ? 
biases->info() : nullptr, + output->info(), + conv_info, + depth_multiplier, + act_info, + gpu_target, + dilation)); const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); @@ -434,7 +442,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { if(_needs_weights_reshape) { - _reshape_weights.configure(compile_context, weights, &_permuted_weights, info); + _reshape_weights->configure(compile_context, weights, &_permuted_weights, info); weights_to_use = &_permuted_weights; } _kernel = arm_compute::support::cpp14::make_unique(); @@ -486,7 +494,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { zero_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); } - _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); + _border_handler->configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); } Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, @@ -505,7 +513,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run() { _permute_input_to_nchw.run(); } - CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(*_border_handler); CLScheduler::get().enqueue(*_kernel); if(_needs_permute) @@ -547,7 +555,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar ARM_COMPUTE_ERROR_ON(_needs_permute); ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); _permuted_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_weights); + CLScheduler::get().enqueue(*_reshape_weights); _original_weights->mark_as_unused(); } _is_prepared = true; @@ -567,7 +575,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, + unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { const GPUTarget gpu_target = CLScheduler::get().target(); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 66ac58ef95..6d63463906 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp index 7138281f87..a2b883ad28 100644 --- a/src/runtime/CL/functions/CLDerivative.cpp +++ b/src/runtime/CL/functions/CLDerivative.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLDerivative.h" -#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLDerivativeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" 
 
 #include
@@ -41,5 +42,5 @@ void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor
     auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
     k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 27acf9f7cc..c3d5f8845f 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDilate.h"
 
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/CL/kernels/CLDilateKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include
@@ -41,5 +42,5 @@ void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *inp
     auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
     k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 07e7a18941..bff882c28b 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -24,19 +24,24 @@
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
 using namespace arm_compute;
 
 CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
+    : _direct_conv_kernel(support::cpp14::make_unique<CLDirectConvolutionLayerKernel>()), _input_border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), _activationlayer_function(),
+      _is_activationlayer_enabled(false)
 {
 }
 
+CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
+
 void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
@@ -47,10 +52,10 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
                                          const ActivationLayerInfo &act_info)
 {
     // Set GPU target
-    _direct_conv_kernel.set_target(CLScheduler::get().target());
+    _direct_conv_kernel->set_target(CLScheduler::get().target());
 
     // Configure direct convolution
-    _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info);
+    _direct_conv_kernel->configure(compile_context, input, weights, biases, output, conv_info);
 
     // Configure border handler
     PixelValue &&zero_value(0.f);
@@ -58,10 +63,10 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
     {
         zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
     }
-    _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _input_border_handler->configure(compile_context, input, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
 
     // Tune kernels
-    CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
+    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
 
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -86,10 +91,10 @@ Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITenso
 void CLDirectConvolutionLayer::run()
 {
     // Run border handler
-    CLScheduler::get().enqueue(_input_border_handler, false);
+    CLScheduler::get().enqueue(*_input_border_handler, false);
 
     // Run direct convolution
-    CLScheduler::get().enqueue(_direct_conv_kernel);
+    CLScheduler::get().enqueue(*_direct_conv_kernel);
 
     //Run Activation Layer
     if(_is_activationlayer_enabled)
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 0ffafa0221..0e3109439e 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -23,11 +23,17 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLMemsetKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
index de94255b48..35ed97d381 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 7b4d3c629d..736cf973a1 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index a1158a71a5..cc927a055b 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -28,6 +28,9 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLHistogramKernel.h"
+#include "src/core/CL/kernels/CLTableLookupKernel.h"
+#include "support/MemorySupport.h"
 
 #include
 #include
@@ -83,10 +86,17 @@ void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_di
 } // namespace
 
 CLEqualizeHistogram::CLEqualizeHistogram()
-    : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
+    : _histogram_kernel(support::cpp14::make_unique<CLHistogramKernel>()),
+      _border_histogram_kernel(support::cpp14::make_unique<CLHistogramBorderKernel>()),
+      _map_histogram_kernel(support::cpp14::make_unique<CLTableLookupKernel>()),
+      _hist(nr_bins, 0, max_range),
+      _cum_dist(nr_bins, 0, max_range),
+      _cd_lut(nr_bins, DataType::U8)
 {
 }
 
+CLEqualizeHistogram::~CLEqualizeHistogram() = default;
+
 void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -94,22 +104,22 @@ void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
 
 void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
 {
-    _histogram_kernel.configure(compile_context, input, &_hist);
-    _border_histogram_kernel.configure(compile_context, input, &_hist);
-    _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output);
+    _histogram_kernel->configure(compile_context, input, &_hist);
+    _border_histogram_kernel->configure(compile_context, input, &_hist);
+    _map_histogram_kernel->configure(compile_context, input, &_cd_lut, output);
 }
 
 void CLEqualizeHistogram::run()
 {
     // Calculate histogram of input.
-    CLScheduler::get().enqueue(_histogram_kernel, false);
+    CLScheduler::get().enqueue(*_histogram_kernel, false);
 
     // Calculate remaining pixels when image is not multiple of the elements of histogram kernel
-    CLScheduler::get().enqueue(_border_histogram_kernel, false);
+    CLScheduler::get().enqueue(*_border_histogram_kernel, false);
 
     // Calculate cumulative distribution of histogram and create LUT.
     calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
 
     // Map input to output using created LUT.
- CLScheduler::get().enqueue(_map_histogram_kernel); + CLScheduler::get().enqueue(*_map_histogram_kernel); } diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp index 5236f620f1..6880c4845a 100644 --- a/src/runtime/CL/functions/CLErode.cpp +++ b/src/runtime/CL/functions/CLErode.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLErode.h" -#include "arm_compute/core/CL/kernels/CLErodeKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLErodeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *inpu auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index 1269cba90d..a0078689ff 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -26,15 +26,28 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/utils/helpers/fft.h" +#include "support/MemorySupport.h" namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(support::cpp14::make_unique<CLFFTDigitReverseKernel>()), + _fft_kernels(), + _scale_kernel(support::cpp14::make_unique<CLFFTScaleKernel>()), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _run_scale(false) { } +CLFFT1D::~CLFFT1D() = default; + void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, config); @@ -62,12 +75,12 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); - _fft_kernels.resize(_num_ffts); + _fft_kernels.reserve(_num_ffts); for(unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -77,7 +90,8 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.radix = radix_for_stage; fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); -
_fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.emplace_back(support::cpp14::make_unique<CLFFTRadixStageKernel>()); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } @@ -88,7 +102,7 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -132,18 +146,18 @@ void CLFFT1D::run() MemoryGroupResourceScope scope_mg(_memory_group); // Run digit reverse - CLScheduler::get().enqueue(_digit_reverse_kernel, false); + CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels for(unsigned int i = 0; i < _num_ffts; ++i) { - CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); + CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling if(_run_scale) { - CLScheduler::get().enqueue(_scale_kernel, true); + CLScheduler::get().enqueue(*_scale_kernel, true); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 7ab852fa98..1d444bb15d 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,6 +26,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" namespace arm_compute { @@ -34,6 +37,8 @@ CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager) { } +CLFFT2D::~CLFFT2D() = default; + void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, config); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index 4d0eab81ee..5472e8469f 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -29,6 +29,13 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp index 97f853fdea..110d2c3639 100644 --- a/src/runtime/CL/functions/CLFastCorners.cpp +++
b/src/runtime/CL/functions/CLFastCorners.cpp @@ -24,12 +24,14 @@ #include "arm_compute/runtime/CL/functions/CLFastCorners.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFastCornersKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "support/MemorySupport.h" #include #include @@ -38,9 +40,9 @@ using namespace arm_compute; CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _fast_corners_kernel(), + _fast_corners_kernel(support::cpp14::make_unique()), _suppr_func(), - _copy_array_kernel(), + _copy_array_kernel(support::cpp14::make_unique()), _output(), _suppr(), _win(), @@ -52,6 +54,8 @@ CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) { } +CLFastCorners::~CLFastCorners() = default; + void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) { @@ -78,11 +82,11 @@ void CLFastCorners::configure(const CLCompileContext &compile_context, const ICL const bool update_number = (nullptr != _num_corners); _memory_group.manage(&_output); - _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); + _fast_corners_kernel->configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); if(!_non_max) { - _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer); + _copy_array_kernel->configure(compile_context, &_output, update_number, _corners, &_num_buffer); } else { @@ -90,7 +94,7 @@ void CLFastCorners::configure(const CLCompileContext &compile_context, const ICL _memory_group.manage(&_suppr); _suppr_func.configure(compile_context, &_output, &_suppr, border_mode); - _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); + _copy_array_kernel->configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); _suppr.allocator()->allocate(); } @@ -113,14 +117,14 @@ void CLFastCorners::run() q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer); } - CLScheduler::get().enqueue(_fast_corners_kernel, false); + CLScheduler::get().enqueue(*_fast_corners_kernel, false); if(_non_max) { _suppr_func.run(); } - CLScheduler::get().enqueue(_copy_array_kernel, false); + CLScheduler::get().enqueue(*_copy_array_kernel, false); unsigned int get_num_corners = 0; q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index a89383ec31..855ed8380a 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLFill.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp index c647bb6a02..27d132b842 100644 --- a/src/runtime/CL/functions/CLFillBorder.cpp +++ b/src/runtime/CL/functions/CLFillBorder.cpp @@ -23,7 
+23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFillBorder.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index a826541017..0646a0d3a0 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFlattenLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 7ed92ac3df..770e6a3781 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFloor.h" -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" +#include "src/core/CL/kernels/CLFloorKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 75e87c382b..1796443ca5 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -28,6 +28,19 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" #include "support/Cast.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 825267c0fc..f018e5a8ae 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -28,14 +28,18 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLFuseBatchNormalization::CLFuseBatchNormalization() - : _fuse_bn_kernel() + : _fuse_bn_kernel(support::cpp14::make_unique()) { } +CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; + void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, const ICLTensor *input_bias, const 
ICLTensor *bn_beta, const ICLTensor *bn_gamma, @@ -49,7 +53,7 @@ void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, float epsilon, FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, @@ -62,6 +66,6 @@ Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, cons void CLFuseBatchNormalization::run() { - CLScheduler::get().enqueue(_fuse_bn_kernel, true); + CLScheduler::get().enqueue(*_fuse_bn_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 80c5496ede..0151485849 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" @@ -38,6 +39,11 @@ #include "src/core/CL/ICLGEMMKernelConfiguration.h" #include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" #include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/float_ops.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" @@ -51,16 +57,58 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::cl_gemm; using namespace arm_compute::utils::cast; +namespace weights_transformations +{ +CLGEMMReshapeRHSMatrixKernelManaged::CLGEMMReshapeRHSMatrixKernelManaged() + : _kernel(support::cpp14::make_unique<CLGEMMReshapeRHSMatrixKernel>()) +{ +} + +CLGEMMReshapeRHSMatrixKernelManaged::~CLGEMMReshapeRHSMatrixKernelManaged() = default; + +void CLGEMMReshapeRHSMatrixKernelManaged::run() +{ + _output.allocator()->allocate(); + CLScheduler::get().enqueue(*_kernel, false); + _reshape_run = true; +} + +void CLGEMMReshapeRHSMatrixKernelManaged::release() +{ + _output.allocator()->free(); +} + +ICLTensor *CLGEMMReshapeRHSMatrixKernelManaged::get_weights() +{ + return &_output; +} + +uint32_t CLGEMMReshapeRHSMatrixKernelManaged::uid() +{ + return _uid; +} + +void CLGEMMReshapeRHSMatrixKernelManaged::configure(const ICLTensor *input, GEMMRHSMatrixInfo info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, info); +} + +void CLGEMMReshapeRHSMatrixKernelManaged::configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info) +{ + _kernel->configure(compile_context, input, &_output, info); +} +} // namespace weights_transformations + CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) : _memory_group(std::move(memory_manager)),
_weights_manager(weights_manager), - _mm_kernel(), - _reshape_lhs_kernel(), - _reshape_rhs_kernel(), - _reshape_rhs_kernel_managed(), - _mm_reshaped_kernel(), - _mm_reshaped_only_rhs_kernel(), - _mm_reshaped_only_rhs_fallback_kernel(), + _mm_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyKernel>()), + _reshape_lhs_kernel(support::cpp14::make_unique<CLGEMMReshapeLHSMatrixKernel>()), + _reshape_rhs_kernel(support::cpp14::make_unique<CLGEMMReshapeRHSMatrixKernel>()), + _reshape_rhs_kernel_managed(support::cpp14::make_unique<weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged>()), + _mm_reshaped_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyReshapedKernel>()), + _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>()), + _mm_reshaped_only_rhs_fallback_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>()), _tmp_a(), _tmp_b(), _original_b(nullptr), @@ -73,6 +121,8 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager * { } +CLGEMM::~CLGEMM() = default; + CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool reshape_b_only_on_first_run) { std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); @@ -98,15 +148,15 @@ void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const const GPUTarget gpu_target = CLScheduler::get().target(); // Set the target for the kernels - _mm_kernel.set_target(gpu_target); + _mm_kernel->set_target(gpu_target); GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); // Configure and tune matrix multiply kernel - _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); // Tune kernel statically - CLScheduler::get().tune_kernel_static(_mm_kernel); + CLScheduler::get().tune_kernel_static(*_mm_kernel); } void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, @@ -122,8 +172,8 @@ void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, cons int mult_interleave4x4_height = 1; // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); + _reshape_lhs_kernel->set_target(gpu_target); + _mm_kernel->set_target(gpu_target); if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) { @@ -158,24 +208,24 @@ void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, cons } // Configure interleave kernel - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); // Configure transpose kernel ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info); + reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get())); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune
matrix multiply kernel - _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + _mm_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - CLScheduler::get().tune_kernel_static(_mm_kernel); + CLScheduler::get().tune_kernel_static(*_mm_kernel); // Allocate intermediate tensors _tmp_a.allocator()->allocate(); @@ -209,8 +259,8 @@ void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, cons kernel_info.activation_info = gemm_info.activation_info(); // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); + _reshape_lhs_kernel->set_target(gpu_target); + _mm_kernel->set_target(gpu_target); const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); @@ -234,21 +284,21 @@ void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, cons // Configure lhs_info and rhs_info std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info); + reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get())); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune matrix multiply kernel - _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); // Allocate intermediate tensors _tmp_a.allocator()->allocate(); @@ -282,7 +332,7 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context kernel_info.activation_info = gemm_info.activation_info(); // Set the target for the kernels - _mm_kernel.set_target(gpu_target); + _mm_kernel->set_target(gpu_target); const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); @@ -305,12 +355,12 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info); + reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get())); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); } // Configure two variants of 
CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true) @@ -319,11 +369,11 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); // Configure matrix multiply kernel with y padding support kernel_info.has_pad_y = true; - _mm_reshaped_only_rhs_fallback_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); if(!_reshape_b_only_on_first_run && use_mm_b) { @@ -626,49 +676,49 @@ void CLGEMM::run() { case CLGEMMKernelType::NATIVE_V1: { - CLScheduler::get().enqueue(_mm_kernel, true); + CLScheduler::get().enqueue(*_mm_kernel, true); break; } case CLGEMMKernelType::RESHAPED_V1: { // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_lhs_kernel, false); if(!_reshape_b_only_on_first_run) { // Run transpose kernel if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get()); } else { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); } } - CLScheduler::get().enqueue(_mm_kernel, true); + CLScheduler::get().enqueue(*_mm_kernel, true); break; } case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_lhs_kernel, false); if(!_reshape_b_only_on_first_run) { // Run transpose kernel if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get()); } else { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); } } - CLScheduler::get().enqueue(_mm_reshaped_kernel, true); + CLScheduler::get().enqueue(*_mm_reshaped_kernel, true); break; } case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -678,20 +728,20 @@ void CLGEMM::run() // Run transpose kernel if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get()); } else { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); } } if(_has_pad_y) { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_fallback_kernel, true); + CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_fallback_kernel, true); } else { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); + CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, true); } break; } @@ -720,13 +770,13 @@ void CLGEMM::prepare() { if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, 
_reshape_rhs_kernel_managed.get()); } else { // Run transpose kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); _original_b->mark_as_unused(); } } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index e871b39805..4d26df5e43 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -30,8 +30,23 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "support/Cast.h" +#include "support/MemorySupport.h" #include #include @@ -43,10 +58,12 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::cast; CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() - : _weights_reshape_kernel() + : _weights_reshape_kernel(support::cpp14::make_unique()) { } +CLConvolutionLayerReshapeWeights::~CLConvolutionLayerReshapeWeights() = default; + void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) { configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups); @@ -64,7 +81,7 @@ void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); const ICLTensor *biases_to_use = (append_biases) ? 
biases : nullptr; - _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases_to_use, output, num_groups); output->info()->set_quantization_info(weights->info()->quantization_info()); } @@ -96,16 +113,18 @@ Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, co void CLConvolutionLayerReshapeWeights::run() { - CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(*_weights_reshape_kernel); } CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), - _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) + : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(support::cpp14::make_unique()), + _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(support::cpp14::make_unique()), _activationlayer_function(), _original_weights(nullptr), + _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) { } +CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; + void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, const ActivationLayerInfo &act_info) @@ -230,8 +249,8 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, _fuse_activation = true; // Set the GPU target for im2col and col2im - _im2col_kernel.set_target(CLScheduler::get().target()); - _col2im_kernel.set_target(CLScheduler::get().target()); + _im2col_kernel->set_target(CLScheduler::get().target()); + _col2im_kernel->set_target(CLScheduler::get().target()); const ICLTensor *gemm_input_to_use = input; ICLTensor *gemm_output_to_use = output; @@ -293,11 +312,11 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, _memory_group.manage(&_im2col_output); // Configure and tune im2col. 
im2col output shape is auto-initialized - _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); + _im2col_kernel->configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); // Set quantization info _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); - CLScheduler::get().tune_kernel_static(_im2col_kernel); + CLScheduler::get().tune_kernel_static(*_im2col_kernel); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -390,8 +409,8 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, if(!_skip_col2im) { // Configure and tune Col2Im - _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); - CLScheduler::get().tune_kernel_static(_col2im_kernel); + _col2im_kernel->configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); + CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } if(!_skip_col2im) @@ -611,7 +630,7 @@ void CLGEMMConvolutionLayer::run() // Run im2col if(!_skip_im2col) { - CLScheduler::get().enqueue(_im2col_kernel); + CLScheduler::get().enqueue(*_im2col_kernel); } // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions @@ -629,7 +648,7 @@ void CLGEMMConvolutionLayer::run() // Reshape output matrix if(!_skip_col2im) { - CLScheduler::get().enqueue(_col2im_kernel, false); + CLScheduler::get().enqueue(*_col2im_kernel.get(), false); } //Run Activation Layer if we cannot fuse in GEMM diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 5fc9c17bef..4d277f0982 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -28,8 +28,23 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "support/MemorySupport.h" -#include #include namespace arm_compute @@ -99,7 +114,7 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr()), _slice_gemm(), _gemmlowp_final(), _reshaped_weights(), @@ -116,6 +131,8 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptrinfo(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), 
deconv_info); _gemm_output.allocator()->allocate(); if(_is_quantized) @@ -357,7 +374,7 @@ void CLGEMMDeconvolutionLayer::run() _mm_gemm.run(); } - CLScheduler::get().enqueue(_deconv_reshape, false); + CLScheduler::get().enqueue(*_deconv_reshape, false); if(_is_quantized) { diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 7a8de6c1f5..d3d80a39e3 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -35,8 +35,16 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h" #include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -71,14 +79,14 @@ inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, Dat CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(), - _mm_native_kernel(), - _mm_reshaped_only_rhs_kernel(), - _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), + _weights_to_qasymm8(support::cpp14::make_unique()), + _mm_native_kernel(support::cpp14::make_unique()), + _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique()), + _mtx_b_reshape_kernel(support::cpp14::make_unique()), + _mtx_a_reduction_kernel(support::cpp14::make_unique()), + _mtx_b_reduction_kernel(support::cpp14::make_unique()), + _offset_contribution_kernel(support::cpp14::make_unique()), + _offset_contribution_output_stage_kernel(support::cpp14::make_unique()), _qasymm8_weights(), _vector_sum_col(), _vector_sum_row(), @@ -100,6 +108,8 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrset_target(gpu_target); + _mm_reshaped_only_rhs_kernel->set_target(gpu_target); GEMMRHSMatrixInfo rhs_info; GEMMLHSMatrixInfo lhs_info; @@ -150,7 +160,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con TensorInfo weights_info(*b->info()); weights_info.set_data_type(DataType::QASYMM8); _qasymm8_weights.allocator()->init(weights_info); - _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); + _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); } const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; @@ -168,7 +178,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure reshape RHS kernel - _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); } // Using default reduction info @@ -185,7 +195,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -196,7 +206,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info); + _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); } GEMMKernelInfo gemm_kernel_info; @@ -226,8 +236,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { @@ -237,7 +247,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); } else { @@ -245,11 +255,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, output, + a->info()->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); _mm_result_s32.allocator()->allocate(); } } @@ -270,7 +280,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); } else { @@ -278,12 +288,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); } // Configure offset contribution kernel - _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, + _b_offset); } // Allocate tensors @@ -489,40 +499,40 @@ void CLGEMMLowpMatrixMultiplyCore::run() if(!_reshape_b_only_on_first_run) { // Run reshape matrix B - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 if(_b_offset != 0) { - CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); + CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false); } // Run matrix multiply if(_is_gemm_reshaped) { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false); + CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false); } else { - CLScheduler::get().enqueue(_mm_native_kernel, false); + CLScheduler::get().enqueue(*_mm_native_kernel, false); } if(_run_output_stage) { // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true); + CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true); } if(_run_offset_contribution) { // Run offset contribution kernel - CLScheduler::get().enqueue(_offset_contribution_kernel, true); + CLScheduler::get().enqueue(*_offset_contribution_kernel, true); } } @@ -533,7 +543,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() if(_convert_to_qasymm8) { _qasymm8_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_to_qasymm8, false); + CLScheduler::get().enqueue(*_weights_to_qasymm8, false); } if(_is_gemm_reshaped && _reshape_b_only_on_first_run) @@ -542,7 +552,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() // Run 
reshape kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); _original_b->mark_as_unused(); } @@ -550,7 +560,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() if(_a_offset != 0 && _reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); } CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 28f397fd8b..f9c5247d2d 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -24,11 +24,14 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" #include "support/MemorySupport.h" +#include + namespace arm_compute { void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index d9b6679ebf..de6296f6a3 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" +#include "src/core/CL/kernels/CLGatherKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp index c62e200315..97db9ba06d 100644 --- a/src/runtime/CL/functions/CLGaussian3x3.cpp +++ b/src/runtime/CL/functions/CLGaussian3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" -#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp index 1fe2fddfb6..f7470d4ecf 100644 --- a/src/runtime/CL/functions/CLGaussian5x5.cpp +++ b/src/runtime/CL/functions/CLGaussian5x5.cpp @@ -24,22 +24,30 @@ #include 
"arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "support/MemorySupport.h" #include using namespace arm_compute; CLGaussian5x5::CLGaussian5x5(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp() + : _memory_group(std::move(memory_manager)), + _kernel_hor(support::cpp14::make_unique()), + _kernel_vert(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()), + _tmp() { } +CLGaussian5x5::~CLGaussian5x5() = default; + void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); @@ -55,9 +63,9 @@ void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor _memory_group.manage(&_tmp); // Configure kernels - _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor->configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); + _kernel_vert->configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffers _tmp.allocator()->allocate(); @@ -65,10 +73,10 @@ void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor void CLGaussian5x5::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); + CLScheduler::get().enqueue(*_kernel_hor, false); + CLScheduler::get().enqueue(*_kernel_vert); } diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp index 297d535ba5..66b85352c1 100644 --- a/src/runtime/CL/functions/CLGaussianPyramid.cpp +++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp @@ -24,19 +24,21 @@ #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" - #include "arm_compute/runtime/CL/CLPyramid.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include 
"src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" +#include "support/MemorySupport.h" #include @@ -47,6 +49,8 @@ CLGaussianPyramid::CLGaussianPyramid() { } +CLGaussianPyramid::~CLGaussianPyramid() = default; + CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT : _horizontal_border_handler(), _vertical_border_handler(), @@ -55,6 +59,8 @@ CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT { } +CLGaussianPyramidHalf::~CLGaussianPyramidHalf() = default; + void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); @@ -80,10 +86,10 @@ void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, I if(num_levels > 1) { - _horizontal_border_handler.resize(num_levels - 1); - _vertical_border_handler.resize(num_levels - 1); - _horizontal_reduction.resize(num_levels - 1); - _vertical_reduction.resize(num_levels - 1); + _horizontal_border_handler.reserve(num_levels - 1); + _vertical_border_handler.reserve(num_levels - 1); + _horizontal_reduction.reserve(num_levels - 1); + _vertical_reduction.reserve(num_levels - 1); // Apply half scale to the X dimension of the tensor shape TensorShape tensor_shape = pyramid->info()->tensor_shape(); @@ -95,16 +101,20 @@ void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, I for(size_t i = 0; i < num_levels - 1; ++i) { /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); + _horizontal_reduction.emplace_back(support::cpp14::make_unique()); + _horizontal_reduction.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); /* Configure vertical kernel */ - _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); + _vertical_reduction.emplace_back(support::cpp14::make_unique()); + _vertical_reduction.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); /* Configure border */ - _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); + _horizontal_border_handler.emplace_back(support::cpp14::make_unique()); + _horizontal_border_handler.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction.back()->border_size(), border_mode, PixelValue(constant_border_value)); /* Configure border */ - _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); + _vertical_border_handler.emplace_back(support::cpp14::make_unique()); + _vertical_border_handler.back()->configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction.back()->border_size(), border_mode, PixelValue(pixel_value_u16)); } _tmp.allocate(); } @@ -127,10 +137,10 @@ void CLGaussianPyramidHalf::run() for(unsigned int i = 0; i < num_levels - 1; ++i) { - CLScheduler::get().enqueue(_horizontal_border_handler[i], false); - CLScheduler::get().enqueue(_horizontal_reduction[i], false); - CLScheduler::get().enqueue(_vertical_border_handler[i], false); - CLScheduler::get().enqueue(_vertical_reduction[i], 
false); + CLScheduler::get().enqueue(*_horizontal_border_handler[i], false); + CLScheduler::get().enqueue(*_horizontal_reduction[i], false); + CLScheduler::get().enqueue(*_vertical_border_handler[i], false); + CLScheduler::get().enqueue(*_vertical_reduction[i], false); } } @@ -163,7 +173,7 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC if(num_levels > 1) { _gauss5x5.resize(num_levels - 1); - _scale_nearest.resize(num_levels - 1); + _scale_nearest.reserve(num_levels - 1); PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); @@ -175,7 +185,8 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); /* Configure scale image kernel */ - _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER }); + _scale_nearest.emplace_back(support::cpp14::make_unique()); + _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER }); } _tmp.allocate(); @@ -199,6 +210,6 @@ void CLGaussianPyramidOrb::run() for(unsigned int i = 0; i < num_levels - 1; ++i) { _gauss5x5[i].run(); - CLScheduler::get().enqueue(_scale_nearest[i]); + CLScheduler::get().enqueue(*_scale_nearest[i]); } } diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 5291de074a..87bf39030a 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -25,22 +25,29 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/CL/kernels/CLPermuteKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "support/MemorySupport.h" namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), + _permute_deltas_kernel(support::cpp14::make_unique()), _flatten_deltas(), - _permute_scores_kernel(), + _permute_scores_kernel(support::cpp14::make_unique()), _flatten_scores(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), - _dequantize_anchors(), - _dequantize_deltas(), - _quantize_all_proposals(), + _compute_anchors_kernel(support::cpp14::make_unique()), + _bounding_box_kernel(support::cpp14::make_unique()), + _pad_kernel(support::cpp14::make_unique()), + _dequantize_anchors(support::cpp14::make_unique()), + _dequantize_deltas(support::cpp14::make_unique()), + _quantize_all_proposals(support::cpp14::make_unique()), _cpp_nms(memory_manager), _is_nhwc(false), _is_qasymm8(false), @@ -62,6 +69,8 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptrconfigure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const 
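Note: the run() hunks above, and all the ones that follow, rely on the scheduler's flush flag. Assuming CLScheduler::enqueue(ICLKernel &kernel, bool flush = true), the signature used throughout this library version, kernels enqueued with false are merely batched on the command queue and only the last enqueue of a chain submits it. A minimal sketch of the contract:

    #include "arm_compute/core/CL/ICLKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void run_chain(ICLKernel &first, ICLKernel &middle, ICLKernel &last)
    {
        CLScheduler::get().enqueue(first, false);  // batched, not submitted
        CLScheduler::get().enqueue(middle, false); // batched, not submitted
        CLScheduler::get().enqueue(last);          // flush = true: submit batch
    }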
TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); @@ -102,7 +111,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context if(!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas_kernel->configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -119,7 +128,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context if(!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores_kernel->configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -137,18 +146,18 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _memory_group.manage(&_all_anchors_f32); _memory_group.manage(&_deltas_flattened_f32); // Dequantize anchors to float - _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32); + _dequantize_anchors->configure(compile_context, &_all_anchors, &_all_anchors_f32); _all_anchors.allocator()->allocate(); anchors_to_use = &_all_anchors_f32; // Dequantize deltas to float - _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); + _dequantize_deltas->configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); _deltas_flattened.allocator()->allocate(); deltas_to_use = &_deltas_flattened_f32; } // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f); - _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box_kernel->configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); @@ -158,7 +167,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); - _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized); + _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; } @@ -193,7 +202,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); _proposals_4_roi_values.allocator()->allocate(); } @@ -343,34 +352,34 @@ void CLGenerateProposalsLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Compute all the anchors - CLScheduler::get().enqueue(_compute_anchors_kernel, false); + CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs if(!_is_nhwc) { - CLScheduler::get().enqueue(_permute_deltas_kernel, false); - CLScheduler::get().enqueue(_permute_scores_kernel, false); + CLScheduler::get().enqueue(*_permute_deltas_kernel, false); + CLScheduler::get().enqueue(*_permute_scores_kernel, false); } _flatten_deltas.run(); _flatten_scores.run(); if(_is_qasymm8) { - CLScheduler::get().enqueue(_dequantize_anchors, false); - CLScheduler::get().enqueue(_dequantize_deltas, false); + CLScheduler::get().enqueue(*_dequantize_anchors, false); + CLScheduler::get().enqueue(*_dequantize_deltas, false); } // Build the boxes - CLScheduler::get().enqueue(_bounding_box_kernel, false); + CLScheduler::get().enqueue(*_bounding_box_kernel, false); if(_is_qasymm8) { - CLScheduler::get().enqueue(_quantize_all_proposals, false); + CLScheduler::get().enqueue(*_quantize_all_proposals, false); } // Non maxima suppression run_cpp_nms_kernel(); // Add dummy batch indexes - CLScheduler::get().enqueue(_pad_kernel, true); + CLScheduler::get().enqueue(*_pad_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp index 21fa6690ea..80026532ab 100644 --- a/src/runtime/CL/functions/CLHOGDescriptor.cpp +++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp @@ -28,14 +28,26 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() + : _memory_group(std::move(memory_manager)), + _gradient(), + _orient_bin(support::cpp14::make_unique()), + _block_norm(support::cpp14::make_unique()), + _mag(), + _phase(), + _hog_space() { } +CLHOGDescriptor::~CLHOGDescriptor() = default; + void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value); @@ -87,10 +99,10 @@ void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTens _memory_group.manage(&_hog_space); // Initialise orientation binning kernel - _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); + _orient_bin->configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); // Initialize HOG norm kernel - _block_norm.configure(compile_context, &_hog_space, output, hog->info()); + _block_norm->configure(compile_context, &_hog_space, output, hog->info()); // Allocate intermediate tensors 
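Note: in the QASYMM8 branch of CLGenerateProposalsLayer above, anchors and deltas are dequantized, the bounding-box transform runs in float, and the proposals are requantized to QASYMM16 with scale 0.125 and zero offset. A standalone sketch of that fixed-point round trip (the real kernels also saturate and follow a configurable rounding policy, omitted here):

    #include <cmath>
    #include <cstdint>

    constexpr float scale  = 0.125f; // from the TensorInfo above
    constexpr int   offset = 0;

    uint16_t quantize_qasymm16(float x) // q = round(x / scale) + offset
    {
        return static_cast<uint16_t>(std::lround(x / scale) + offset);
    }

    float dequantize_qasymm16(uint16_t q) // x = scale * (q - offset)
    {
        return scale * static_cast<float>(static_cast<int>(q) - offset);
    }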
_mag.allocator()->allocate(); @@ -106,8 +118,8 @@ void CLHOGDescriptor::run() _gradient.run(); // Run orientation binning - CLScheduler::get().enqueue(_orient_bin, false); + CLScheduler::get().enqueue(*_orient_bin, false); // Run block normalization - CLScheduler::get().enqueue(_block_norm); + CLScheduler::get().enqueue(*_block_norm); } \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp index 9188f654dc..07ae8151c0 100644 --- a/src/runtime/CL/functions/CLHOGDetector.cpp +++ b/src/runtime/CL/functions/CLHOGDetector.cpp @@ -23,19 +23,22 @@ */ #include "arm_compute/runtime/CL/functions/CLHOGDetector.h" -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLHOGDetectorKernel.h" +#include "support/MemorySupport.h" #include using namespace arm_compute; CLHOGDetector::CLHOGDetector() - : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows() + : _hog_detector_kernel(support::cpp14::make_unique()), _detection_windows(nullptr), _num_detection_windows() { } +CLHOGDetector::~CLHOGDetector() = default; + void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) { configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class); @@ -50,7 +53,7 @@ void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICL _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); // Configure HOGDetectorKernel - _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); + _hog_detector_kernel->configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); } void CLHOGDetector::run() @@ -62,7 +65,7 @@ void CLHOGDetector::run() q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows); // Run CLHOGDetectorKernel - CLScheduler::get().enqueue(_hog_detector_kernel); + CLScheduler::get().enqueue(*_hog_detector_kernel); // Read number of detections unsigned int num_detection_windows = 0; diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp index 934d1f6351..5f3b9cf529 100644 --- a/src/runtime/CL/functions/CLHOGGradient.cpp +++ b/src/runtime/CL/functions/CLHOGGradient.cpp @@ -26,11 +26,18 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLHOGGradient::CLHOGGradient(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy() + : _memory_group(std::move(memory_manager)), + _derivative(), + _mag_phase(support::cpp14::make_unique()), + _gx(), + _gy() { } @@ -63,11 +70,11 @@ void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor // Initialise magnitude/phase kernel if(PhaseType::UNSIGNED == phase_type) { - 
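Note: CLHOGDetector above keeps its detection count in a one-word OpenCL buffer that the kernel increments and the host reads back. The round trip, condensed into one function from the same OpenCL C++ API calls as the hunk (the detector kernel enqueue itself and error handling are elided):

    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    unsigned int detector_count_round_trip()
    {
        // CL_MEM_ALLOC_HOST_PTR keeps the one-word counter host-accessible.
        cl::Buffer num_windows(CLScheduler::get().context(),
                               CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
                               sizeof(unsigned int));
        cl::CommandQueue q    = CLScheduler::get().queue();
        unsigned int     init = 0;
        q.enqueueWriteBuffer(num_windows, CL_FALSE, 0, sizeof(unsigned int), &init);
        // ... enqueue the detector kernel here ...
        unsigned int count = 0;
        q.enqueueReadBuffer(num_windows, CL_TRUE, 0, sizeof(unsigned int), &count);
        return count;
    }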
_mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); + _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); } else { - _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); + _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); } // Allocate intermediate tensors @@ -83,5 +90,5 @@ void CLHOGGradient::run() _derivative.run(); // Run magnitude/phase kernel - CLScheduler::get().enqueue(_mag_phase); + CLScheduler::get().enqueue(*_mag_phase); } \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp index 51db43cd71..dfc90537cf 100644 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp @@ -30,6 +30,11 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "src/core/CL/kernels/CLHOGDetectorKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -52,6 +57,8 @@ CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr memory_ { } +CLHOGMultiDetection::~CLHOGMultiDetection() = default; + void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) { @@ -135,8 +142,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute - _orient_bin_kernel.resize(_num_orient_bin_kernel); - _block_norm_kernel.resize(_num_block_norm_kernel); + _orient_bin_kernel.reserve(_num_orient_bin_kernel); + _block_norm_kernel.reserve(_num_block_norm_kernel); _hog_detect_kernel.resize(_num_hog_detect_kernel); _hog_space.resize(_num_orient_bin_kernel); _hog_norm_space.resize(_num_block_norm_kernel); @@ -181,7 +188,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL _memory_group.manage(&_hog_space[i]); // Initialise orientation binning kernel - _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); + _orient_bin_kernel.emplace_back(support::cpp14::make_unique()); + _orient_bin_kernel.back()->configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); } // Allocate intermediate tensors @@ -202,7 +210,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL _memory_group.manage(&_hog_norm_space[i]); // Initialize block normalization kernel - _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); + _block_norm_kernel.emplace_back(support::cpp14::make_unique()); + _block_norm_kernel.back()->configure(compile_context, 
&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); } // Allocate intermediate tensors @@ -248,13 +257,13 @@ void CLHOGMultiDetection::run() // Run orientation binning kernel for(size_t i = 0; i < _num_orient_bin_kernel; ++i) { - CLScheduler::get().enqueue(_orient_bin_kernel[i], false); + CLScheduler::get().enqueue(*_orient_bin_kernel[i], false); } // Run block normalization kernel for(size_t i = 0; i < _num_block_norm_kernel; ++i) { - CLScheduler::get().enqueue(_block_norm_kernel[i], false); + CLScheduler::get().enqueue(*_block_norm_kernel[i], false); } // Run HOG detector kernel diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp index 45b93a5be0..9d8ebceb30 100644 --- a/src/runtime/CL/functions/CLHarrisCorners.cpp +++ b/src/runtime/CL/functions/CLHarrisCorners.cpp @@ -24,8 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -35,6 +33,10 @@ #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" #include "arm_compute/runtime/ITensorAllocator.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLHarrisCornersKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "support/MemorySupport.h" #include @@ -45,12 +47,12 @@ using namespace arm_compute; CLHarrisCorners::CLHarrisCorners(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _sobel(nullptr), - _harris_score(), + _harris_score(support::cpp14::make_unique()), _non_max_suppr(), _candidates(), _sort_euclidean(), - _border_gx(), - _border_gy(), + _border_gx(support::cpp14::make_unique()), + _border_gy(support::cpp14::make_unique()), _gx(), _gy(), _score(), @@ -61,6 +63,8 @@ CLHarrisCorners::CLHarrisCorners(std::shared_ptr memory_manager) { } +CLHarrisCorners::~CLHarrisCorners() = default; + void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) @@ -133,11 +137,11 @@ void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImag _memory_group.manage(&_score); // Set/init Harris Score kernel accordingly with block_size - _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score->configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); // Configure border filling using harris score kernel's block size - _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); - _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); + _border_gx->configure(compile_context, &_gx, _harris_score->border_size(), border_mode, PixelValue(constant_border_value)); + _border_gy->configure(compile_context, &_gy, _harris_score->border_size(), border_mode, 
PixelValue(constant_border_value)); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -175,11 +179,11 @@ void CLHarrisCorners::run() _sobel->run(); // Fill border before harris score kernel - CLScheduler::get().enqueue(_border_gx, false); - CLScheduler::get().enqueue(_border_gy, false); + CLScheduler::get().enqueue(*_border_gx, false); + CLScheduler::get().enqueue(*_border_gy, false); // Run harris score kernel - CLScheduler::get().enqueue(_harris_score, false); + CLScheduler::get().enqueue(*_harris_score, false); // Run non-maxima suppression _non_max_suppr.run(); diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 4a60ee9d08..bd680f448d 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -23,9 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" #include "arm_compute/core/Types.h" - +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp index 8561494242..41e47e77c7 100644 --- a/src/runtime/CL/functions/CLIntegralImage.cpp +++ b/src/runtime/CL/functions/CLIntegralImage.cpp @@ -23,16 +23,20 @@ */ #include "arm_compute/runtime/CL/functions/CLIntegralImage.h" -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLIntegralImageKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLIntegralImage::CLIntegralImage() - : _integral_hor(), _integral_vert() + : _integral_hor(support::cpp14::make_unique()), + _integral_vert(support::cpp14::make_unique()) { } +CLIntegralImage::~CLIntegralImage() = default; + void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -40,12 +44,12 @@ void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - _integral_hor.configure(compile_context, input, output); - _integral_vert.configure(compile_context, output); + _integral_hor->configure(compile_context, input, output); + _integral_vert->configure(compile_context, output); } void CLIntegralImage::run() { - CLScheduler::get().enqueue(_integral_hor, false); - CLScheduler::get().enqueue(_integral_vert); + CLScheduler::get().enqueue(*_integral_hor, false); + CLScheduler::get().enqueue(*_integral_vert); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 66191d1799..64aac269cd 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -24,12 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include 
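Note: CLIntegralImage above is a fixed two-kernel pipeline: the horizontal kernel writes row prefix sums into the output, then the vertical kernel accumulates columns in place. What the pair computes, as a standalone scalar sketch:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> integral(const std::vector<uint8_t> &src, int w, int h)
    {
        std::vector<uint32_t> dst(static_cast<std::size_t>(w) * h, 0);
        for(int y = 0; y < h; ++y)  // pass 1: horizontal prefix sums
            for(int x = 0; x < w; ++x)
                dst[y * w + x] = src[y * w + x] + (x > 0 ? dst[y * w + x - 1] : 0);
        for(int y = 1; y < h; ++y)  // pass 2: vertical accumulation, in place
            for(int x = 0; x < w; ++x)
                dst[y * w + x] += dst[(y - 1) * w + x];
        return dst;
    }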
"src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -39,10 +42,15 @@ constexpr int max_input_tensor_dim = 3; } // namespace CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() + : _memory_group(std::move(memory_manager)), + _reduce_func(), + _normalize_kernel(support::cpp14::make_unique()), + _sumsq() { } +CLL2NormalizeLayer::~CLL2NormalizeLayer() = default; + void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); @@ -59,7 +67,7 @@ void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLT // Configure kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon); + _normalize_kernel->configure(compile_context, input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensor _sumsq.allocator()->allocate(); @@ -91,6 +99,6 @@ void CLL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - CLScheduler::get().enqueue(_normalize_kernel, true); + CLScheduler::get().enqueue(*_normalize_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 058b6027c2..b095c06535 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -29,6 +29,22 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -37,20 +53,23 @@ using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - 
_accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), - _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), - _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), - _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), - _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), - _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), - _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), + _transpose_cell_state(support::cpp14::make_unique()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), + _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), + _fully_connected_output_state(), _projection_clip(), _copy_cell_state(support::cpp14::make_unique()), _copy_output(support::cpp14::make_unique()), _concat_scratch_buffer(), + _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), _ones_memset_kernel(support::cpp14::make_unique()), + _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), + _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), + _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), + _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), 
_cell_layer_norm_out2(), _output_layer_norm_out1(), + _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + _is_layer_norm_lstm(false) { } +CLLSTMLayer::~CLLSTMLayer() = default; + void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, @@ -172,7 +191,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _ones_memset_kernel->configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; @@ -241,7 +260,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_out1); _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); _cell_state_out2.allocator()->allocate(); @@ -367,8 +386,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe } // Copy cell state and output - _copy_cell_state.configure(compile_context, &_cell_state_out1, cell_state_out); - _copy_output.configure(compile_context, output_state_out, output); + _copy_cell_state->configure(compile_context, &_cell_state_out1, cell_state_out); + _copy_output->configure(compile_context, output_state_out, output); // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; @@ -642,7 +661,7 @@ void CLLSTMLayer::run() if(_run_cifg_opt) { - CLScheduler::get().enqueue(_ones_memset_kernel); + CLScheduler::get().enqueue(*_ones_memset_kernel); _subtract_input_gate.run(); } else @@ -665,7 +684,7 @@ void CLLSTMLayer::run() } _fully_connected_cell_state.run(); - CLScheduler::get().enqueue(_transpose_cell_state); + CLScheduler::get().enqueue(*_transpose_cell_state); _gemm_cell_state1.run(); _accum_cell_state1.run(); if(_is_layer_norm_lstm) @@ -711,8 +730,8 @@ void CLLSTMLayer::run() } } - CLScheduler::get().enqueue(_copy_cell_state); - CLScheduler::get().enqueue(_copy_output); + CLScheduler::get().enqueue(*_copy_cell_state); + CLScheduler::get().enqueue(*_copy_output); _concat_scratch_buffer.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index 76a531b1c9..46062387e7 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -27,6 +27,14 @@ #include "arm_compute/core/Utils.h" 
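Note: the CIFG branch above has no trained input gate: a ones tensor is filled once by the memset kernel and the input gate is then derived as 1 - forget_gate with a saturating subtraction. The arithmetic, as a plain sketch (saturation omitted since gate activations already lie in [0, 1]):

    #include <cstddef>
    #include <vector>

    std::vector<float> cifg_input_gate(const std::vector<float> &forget_gate)
    {
        std::vector<float> ones(forget_gate.size(), 1.0f); // memset kernel's role
        std::vector<float> input_gate(forget_gate.size());
        for(std::size_t i = 0; i < forget_gate.size(); ++i)
            input_gate[i] = ones[i] - forget_gate[i];      // coupled input/forget gates
        return input_gate;
    }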
#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp index 81e903cde8..1ad19e56ea 100644 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -32,6 +32,9 @@ #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp index cbb952c3f6..d7fd81754b 100644 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp @@ -23,11 +23,13 @@ */ #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/IPyramid.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp index 74cb47347f..04e59ac4a6 100644 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -27,6 +27,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "support/MemorySupport.h" #include #include @@ -78,8 +83,16 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons } // namespace CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) + : _memory_group(std::move(memory_manager)), + _input_im2col_kernel(support::cpp14::make_unique()), + _weights_reshape_kernel(support::cpp14::make_unique()), + _mm_kernel(support::cpp14::make_unique()), + _output_col2im_kernel(support::cpp14::make_unique()), + _input_im2col_reshaped(), + _weights_reshaped(), + _gemm_output(), + _is_prepared(false), + _original_weights(nullptr) { } @@ 
-169,16 +182,16 @@ void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, _memory_group.manage(&_gemm_output); // Configure kernels - _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped); - _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); + _input_im2col_kernel->configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped); + _mm_kernel->configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); + _output_col2im_kernel->configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); // Allocate intermediate tensors _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); - CLScheduler::get().tune_kernel_static(_input_im2col_kernel); + CLScheduler::get().tune_kernel_static(*_input_im2col_kernel); } void CLLocallyConnectedLayer::run() @@ -188,13 +201,13 @@ void CLLocallyConnectedLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Run input reshaping - CLScheduler::get().enqueue(_input_im2col_kernel); + CLScheduler::get().enqueue(*_input_im2col_kernel); // Runs vector matrix multiply on reshaped matrices - CLScheduler::get().enqueue(_mm_kernel); + CLScheduler::get().enqueue(*_mm_kernel); // Reshape output matrix - CLScheduler::get().enqueue(_output_col2im_kernel, false); + CLScheduler::get().enqueue(*_output_col2im_kernel.get(), false); } void CLLocallyConnectedLayer::prepare() @@ -205,7 +218,7 @@ void CLLocallyConnectedLayer::prepare() // Run weights reshaping and mark original weights tensor as unused _weights_reshaped.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(*_weights_reshape_kernel); _original_weights->mark_as_unused(); CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp index 962adadbb2..fb3ebdaa96 100644 --- a/src/runtime/CL/functions/CLMagnitude.cpp +++ b/src/runtime/CL/functions/CLMagnitude.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLMagnitude.h" -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 3e32c55067..392bff2b4e 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -24,18 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _memset_kernel(), _unpooling_layer_kernel() + : 
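Note: the locally-connected hunks above show the library's prepare() idiom in miniature: the weights-reshape kernel runs exactly once, after which the original weights can be reclaimed. Stitched back together from the scattered hunks (no new API, only the calls already visible above, with the usual _is_prepared guard assumed):

    void CLLocallyConnectedLayer::prepare()
    {
        if(!_is_prepared)
        {
            _weights_reshaped.allocator()->allocate();            // one-time buffer
            CLScheduler::get().enqueue(*_weights_reshape_kernel); // one-time reshape
            _original_weights->mark_as_unused();                  // source reclaimable
            CLScheduler::get().queue().finish();                  // reshape must finish first
            _is_prepared = true;
        }
    }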
_memset_kernel(support::cpp14::make_unique()), + _unpooling_layer_kernel(support::cpp14::make_unique()) { } +CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; + void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); @@ -44,9 +49,9 @@ void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTen void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) { const PixelValue zero_value(0.f); - _memset_kernel.configure(output, zero_value); + _memset_kernel->configure(output, zero_value); - _unpooling_layer_kernel.configure(compile_context, input, indices, output, pool_info); + _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) @@ -57,9 +62,9 @@ Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo void CLMaxUnpoolingLayer::run() { // Run memset - CLScheduler::get().enqueue(_memset_kernel, false); + CLScheduler::get().enqueue(*_memset_kernel, false); // Run max unpooling layer - CLScheduler::get().enqueue(_unpooling_layer_kernel); + CLScheduler::get().enqueue(*_unpooling_layer_kernel); } } /* namespace arm_compute */ diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp index 2517fdc4ef..c91bc954b8 100644 --- a/src/runtime/CL/functions/CLMeanStdDev.cpp +++ b/src/runtime/CL/functions/CLMeanStdDev.cpp @@ -25,6 +25,10 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMeanStdDevKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -39,13 +43,15 @@ CLMeanStdDev::CLMeanStdDev(std::shared_ptr memory_manager) // NO _reduction_output_stddev(), _mean(nullptr), _stddev(nullptr), - _mean_stddev_kernel(), - _fill_border_kernel(), + _mean_stddev_kernel(support::cpp14::make_unique()), + _fill_border_kernel(support::cpp14::make_unique()), _global_sum(), _global_sum_squared() { } +CLMeanStdDev::~CLMeanStdDev() = default; + Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev) { ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); @@ -101,8 +107,8 @@ void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage * _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); } - _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); + _mean_stddev_kernel->configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); + _fill_border_kernel->configure(compile_context, input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); } } @@ -149,8 +155,8 @@ void CLMeanStdDev::run_float() void CLMeanStdDev::run_int() { - CLScheduler::get().enqueue(_fill_border_kernel); - 
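Note: max unpooling is the two kernels above composed: zero the output, then scatter each pooled value back to the argmax index the pooling layer recorded. A standalone scalar sketch:

    #include <cstddef>
    #include <vector>

    std::vector<float> max_unpool(const std::vector<float>       &pooled,
                                  const std::vector<std::size_t> &indices,
                                  std::size_t                     output_size)
    {
        std::vector<float> out(output_size, 0.0f); // CLMemsetKernel's role
        for(std::size_t i = 0; i < pooled.size(); ++i)
            out[indices[i]] = pooled[i];           // unpooling kernel's role
        return out;
    }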
CLScheduler::get().enqueue(_mean_stddev_kernel); + CLScheduler::get().enqueue(*_fill_border_kernel); + CLScheduler::get().enqueue(*_mean_stddev_kernel); } void CLMeanStdDev::run() diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index 07ab669fde..5b5ff49ecb 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp index 92153128f9..2040ebd4f5 100644 --- a/src/runtime/CL/functions/CLMedian3x3.cpp +++ b/src/runtime/CL/functions/CLMedian3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLMedian3x3.h" -#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMedian3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor * auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp index a27defe2f7..3ddd4d04ed 100644 --- a/src/runtime/CL/functions/CLMinMaxLocation.cpp +++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" - #include "arm_compute/core/CL/CLHelpers.h" +#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLMinMaxLocation::CLMinMaxLocation() - : _min_max_kernel(), - _min_max_loc_kernel(), + : _min_max_kernel(support::cpp14::make_unique()), + _min_max_loc_kernel(support::cpp14::make_unique()), _min_max_vals(), _min_max_count_vals(), _min(nullptr), @@ -41,6 +42,8 @@ CLMinMaxLocation::CLMinMaxLocation() { } +CLMinMaxLocation::~CLMinMaxLocation() = default; + void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) { configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count); @@ -62,16 +65,16 @@ void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const _min_loc = min_loc; _max_loc = max_loc; - _min_max_kernel.configure(compile_context, input, &_min_max_vals); - _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); + _min_max_kernel->configure(compile_context, input, &_min_max_vals); + _min_max_loc_kernel->configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); } void CLMinMaxLocation::run() { cl::CommandQueue q = CLScheduler::get().queue(); - CLScheduler::get().enqueue(_min_max_kernel, false); - CLScheduler::get().enqueue(_min_max_loc_kernel, false); + CLScheduler::get().enqueue(*_min_max_kernel, false); + CLScheduler::get().enqueue(*_min_max_loc_kernel, false); // Update min and max q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast(_min)); diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp index 71f08e8072..3312f6f9a7 100644 --- a/src/runtime/CL/functions/CLNonLinearFilter.cpp +++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp @@ -23,7 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" -#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTe auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp index a79bb0c5a3..22ca176a71 100644 --- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp +++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp @@ -23,7 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" -#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -43,10 
+44,10 @@ void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_contex if(border_mode != BorderMode::UNDEFINED) { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); + _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); } else { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); } } diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index 4be6257bbf..40a6cdd2f4 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -25,18 +25,25 @@ #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLNormalizationLayer::CLNormalizationLayer() - : _norm_kernel(), _border_handler() + : _norm_kernel(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()) { } +CLNormalizationLayer::~CLNormalizationLayer() = default; + void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); @@ -47,10 +54,10 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC ARM_COMPUTE_ERROR_ON(input == nullptr); // Configure normalization kernel - _norm_kernel.configure(compile_context, input, output, norm_info); + _norm_kernel->configure(compile_context, input, output, norm_info); // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) @@ -61,8 +68,8 @@ Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInf void CLNormalizationLayer::run() { // Run border handler - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); + CLScheduler::get().enqueue(*_norm_kernel); } diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 806e6489a2..9576486db0 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp 
b/src/runtime/CL/functions/CLOpticalFlow.cpp index 0b5547eaab..fca6192296 100644 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ b/src/runtime/CL/functions/CLOpticalFlow.cpp @@ -24,7 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Window.h" @@ -33,6 +32,8 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLScharr3x3.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLLKTrackerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -42,7 +43,7 @@ CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), - _tracker_finalize_kernel(), + _tracker_finalize_kernel(support::cpp14::make_unique()), _func_scharr(), _scharr_gx(), _scharr_gy(), @@ -57,6 +58,8 @@ CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // { } +CLOpticalFlow::~CLOpticalFlow() = default; + void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, @@ -93,9 +96,9 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP const int old_values_list_length = list_length * window_dimension * window_dimension; // Create kernels and tensors - _tracker_init_kernel.resize(_num_levels); - _tracker_stage0_kernel.resize(_num_levels); - _tracker_stage1_kernel.resize(_num_levels); + _tracker_init_kernel.reserve(_num_levels); + _tracker_stage0_kernel.reserve(_num_levels); + _tracker_stage1_kernel.reserve(_num_levels); _func_scharr.resize(_num_levels); _scharr_gx.resize(_num_levels); _scharr_gy.resize(_num_levels); @@ -134,16 +137,19 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); // Init Lucas-Kanade init kernel - _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); + _tracker_init_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_init_kernel.back()->configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], - _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - window_dimension, i); + _tracker_stage0_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_stage0_kernel.back()->configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], + _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + window_dimension, i); // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, 
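Note: the per-level kernel triples configured here implement pyramidal Lucas-Kanade; the run() hunk that follows walks them from the coarsest level down to level 0, refining the point estimates at each step, then runs the finalize kernel once. The schedule as a skeleton (the run_* functions are hypothetical stand-ins for the per-level enqueues):

    #include <cstddef>

    void run_scharr(std::size_t level); // gradients for one pyramid level
    void run_init(std::size_t level);   // scale point estimates into the level
    void run_stage0(std::size_t level); // per-point spatial gradient matrix
    void run_stage1(std::size_t level); // iterative LK refinement
    void finalize();                    // copy internal points to the user array

    void track(std::size_t num_levels)
    {
        for(std::size_t level = num_levels; level > 0; --level)
        {
            run_scharr(level - 1);
            run_init(level - 1);
            run_stage0(level - 1);
            run_stage1(level - 1);
        }
        finalize();
    }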
_new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - termination, epsilon, num_iterations, window_dimension, i); + _tracker_stage1_kernel.emplace_back(support::cpp14::make_unique<CLLKTrackerStage1Kernel>()); + _tracker_stage1_kernel.back()->configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + termination, epsilon, num_iterations, window_dimension, i); // Allocate intermediate buffers _scharr_gx[i].allocator()->allocate(); @@ -151,7 +157,7 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP } // Finalize Lucas-Kanade - _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points); + _tracker_finalize_kernel->configure(compile_context, _new_points_internal.get(), new_points); } void CLOpticalFlow::run() @@ -166,14 +172,14 @@ void CLOpticalFlow::run() _func_scharr[level - 1].run(); // Run Lucas-Kanade init kernel - CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_init_kernel[level - 1]); // Run Lucas-Kanade stage0 kernel - CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_stage0_kernel[level - 1]); // Run Lucas-Kanade stage1 kernel - CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_stage1_kernel[level - 1]); } - CLScheduler::get().enqueue(_tracker_finalize_kernel, true); + CLScheduler::get().enqueue(*_tracker_finalize_kernel, true); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index aaddd46071..60cf4d1a2d 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index fb6078cc79..388b07b76e 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,14 +22,21 @@ * SOFTWARE.
*/ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLPadLayer::CLPadLayer() - : _pad_kernel(), _copy_kernel(), _perform_pad(false) + : _pad_kernel(support::cpp14::make_unique<CLPadLayerKernel>()), + _copy_kernel(support::cpp14::make_unique<CLCopyKernel>()), + _perform_pad(false) { } +CLPadLayer::~CLPadLayer() = default; + void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); @@ -46,12 +53,12 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i if(_perform_pad) { - _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode); + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(compile_context, input, output); + _copy_kernel->configure(compile_context, input, output); } } Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) @@ -75,11 +82,11 @@ void CLPadLayer::run() { if(_perform_pad) { - CLScheduler::get().enqueue(_pad_kernel); + CLScheduler::get().enqueue(*_pad_kernel); } else { - CLScheduler::get().enqueue(_copy_kernel); + CLScheduler::get().enqueue(*_copy_kernel); } } } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index e13046bd46..f7f0bc4f5d 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLPermute.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" #include "arm_compute/core/Error.h" +#include "src/core/CL/kernels/CLPermuteKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp index 64d2e0fdff..6594cd5bac 100644 --- a/src/runtime/CL/functions/CLPhase.cpp +++ b/src/runtime/CL/functions/CLPhase.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLPhase.h" -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 883ce68536..12cc5d60af 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "support/MemorySupport.h" #include @@ -55,7 +56,7 @@ ITensorPack select_border_input(ITensorPack &tensors) namespace experimental { CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _border_handler() + :
_border_handler(support::cpp14::make_unique<CLFillBorderKernel>()) { } @@ -72,7 +73,7 @@ void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_contex if(broadcasted_info->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -86,12 +87,12 @@ Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen void CLPixelWiseMultiplication::run(ITensorPack &tensors) { auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); + CLScheduler::get().enqueue_op(*_border_handler, border_pack); ICLOperator::run(tensors); } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _border_handler() + : _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()) { } @@ -107,7 +108,7 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile if(broadcasted_info->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -120,7 +121,7 @@ Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, con void CLComplexPixelWiseMultiplication::run(ITensorPack &tensors) { auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); + CLScheduler::get().enqueue_op(*_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index a14818fffe..7f99aee9ba 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPoolingLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -79,7 +80,7 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso default: ARM_COMPUTE_ERROR("Data layout not supported"); } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); // Tune kernels CLScheduler::get().tune_kernel_static(*_kernel); diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index fefbff639d..8cb971793e 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -24,13 +24,13 @@ #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h" -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" - +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git
a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 2d21d210e4..54df5a0a5e 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -30,7 +30,18 @@ #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -86,10 +97,50 @@ void CLQLSTMLayer::TensorCopyKernel::run() } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _input_to_input_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_cell_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _input_to_output_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _projection_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _layer_norms(), + _copy_output(support::cpp14::make_unique<CLCopyKernel>()) { + for(auto &norm : _layer_norms) + { + norm = support::cpp14::make_unique<CLQLSTMLayerNormalizationKernel>(); + } + _memory_group = MemoryGroup(std::move(memory_manager)); } +CLQLSTMLayer::~CLQLSTMLayer() = default; + +void CLQLSTMLayer::configure_layer_norm(LayerNormGate g, const ICLTensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + CLTensor *out = &get_layer_norm_output(g); + _memory_group.manage(out); + out->allocator()->init(*(in->info())); + + get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage.
+ const TensorInfo out + { + in + }; + return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, @@ -200,18 +251,18 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + 
_recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); if(_has_projection) { - _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); if(_projection_bias != nullptr) { _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); @@ -543,7 +594,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } // Copy output_state_out to output - _copy_output.configure(compile_context, output_state_out, output); + _copy_output->configure(compile_context, output_state_out, output); } Status CLQLSTMLayer::validate(const ITensorInfo *input, @@ -1049,7 +1100,7 @@ void CLQLSTMLayer::run() } // Copy output_state_out to output - CLScheduler::get().enqueue(_copy_output); + CLScheduler::get().enqueue(*_copy_output); } void CLQLSTMLayer::prepare() @@ -1081,8 +1132,8 @@ void CLQLSTMLayer::prepare() { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_input_reduction); - CLScheduler::get().enqueue(_recurrent_to_input_reduction); + CLScheduler::get().enqueue(*_input_to_input_reduction); + CLScheduler::get().enqueue(*_recurrent_to_input_reduction); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1097,17 +1148,17 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_forget_reduction); - CLScheduler::get().enqueue(_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(_input_to_cell_reduction); - CLScheduler::get().enqueue(_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(_input_to_output_reduction); - CLScheduler::get().enqueue(_recurrent_to_output_reduction); + CLScheduler::get().enqueue(*_input_to_forget_reduction); + CLScheduler::get().enqueue(*_recurrent_to_forget_reduction); + CLScheduler::get().enqueue(*_input_to_cell_reduction); + CLScheduler::get().enqueue(*_recurrent_to_cell_reduction); + CLScheduler::get().enqueue(*_input_to_output_reduction); + CLScheduler::get().enqueue(*_recurrent_to_output_reduction); if(_has_projection) { _projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_projection_reduction); + CLScheduler::get().enqueue(*_projection_reduction); if(_projection_bias != nullptr) { _projection_bias_add.run(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index f0a446acab..f132547eb9 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git 
a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 94e7f9440c..be3e539f98 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,17 +28,33 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(support::cpp14::make_unique<CLCopyKernel>()), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) { } +CLRNNLayer::~CLRNNLayer() = default; + Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, const ActivationLayerInfo &info) { @@ -107,7 +123,7 @@ void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTen _activation.configure(compile_context, &_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(compile_context, hidden_state, output); + _copy_kernel->configure(compile_context, hidden_state, output); } void CLRNNLayer::run() @@ -122,7 +138,7 @@ _activation.run(); // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); + CLScheduler::get().enqueue(*_copy_kernel); } void CLRNNLayer::prepare() diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 2337cee33f..cf28a1a0fb 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index cdf60ce04f..b0e6716cce 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++
b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,10 +22,8 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" - #include "arm_compute/core/CL/ICLArray.h" - -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 8bf2a0c43e..57b57bd305 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/functions/CLRange.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLRangeKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index 4ea7f7642f..b761dc2f99 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLReduceMean.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 208371c45d..7423f4bc87 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -30,9 +30,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" - #include "support/MemorySupport.h" namespace arm_compute @@ -43,6 +44,8 @@ CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memor { } +CLReductionOperation::~CLReductionOperation() = default; + Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -211,7 +214,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC } // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + _reduction_kernels_vector.reserve(_num_of_stages); // Create temporary tensors if(_is_serial) { @@ -221,11 +224,12 @@ _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[0]->configure(compile_context, input, output_internal, axis, op, 0); } else { -
_border_handlers_vector.resize(_num_of_stages); + _border_handlers_vector.reserve(_num_of_stages); _memory_group.manage(&_results_vector[0]); ReductionOperation first_kernel_op; @@ -269,15 +273,23 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC ARM_COMPUTE_ERROR("Not supported"); } - _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); - _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[0]->configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>()); + _border_handlers_vector[0]->configure(compile_context, input, _reduction_kernels_vector[0]->border_size(), BorderMode::CONSTANT, pixelValue); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); - _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); + + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[i]->configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>()); + _border_handlers_vector[i]->configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i]->border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[i - 1].allocator()->allocate(); } @@ -290,8 +302,12 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>()); + _border_handlers_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage]->border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[last_stage - 1].allocator()->allocate(); } @@ -308,14 +324,14 @@ void CLReductionOperation::run() if(_is_serial) { - CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[0], false); } else { for(unsigned int i = 0; i < _num_of_stages; ++i) { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + CLScheduler::get().enqueue(*_border_handlers_vector[i], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false); } } diff --git a/src/runtime/CL/functions/CLRemap.cpp
b/src/runtime/CL/functions/CLRemap.cpp index 1e3d614402..6466c2843b 100644 --- a/src/runtime/CL/functions/CLRemap.cpp +++ b/src/runtime/CL/functions/CLRemap.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLRemap.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLRemapKernel.h" #include "support/MemorySupport.h" #include @@ -53,5 +54,5 @@ void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *inpu auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>(); k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 1dc41aefb5..4b2f70334f 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 273a761a0a..5112064b23 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" +#include "src/core/CL/kernels/CLReshapeLayerKernel.h" #include "support/MemorySupport.h" /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 213fbc8f32..b73d8de62e 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLReverse.h" -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLReverseKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index e111c6d1f7..383b0cc305 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -24,10 +24,11 @@ #include "arm_compute/runtime/CL/functions/CLScale.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -60,7 +61,7 @@ void CLScale::configure(const
CLCompileContext &compile_context, ICLTensor *inpu { border_mode_to_use = BorderMode::CONSTANT; } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value); } void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp index b121ee7b99..e5d0d2d630 100644 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ b/src/runtime/CL/functions/CLScharr3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLScharr3x3.h" -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLScharr3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor * auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index ef8010847b..374da91b78 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -23,9 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLSelect.h" -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLSelectKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index f36550ba91..940540563a 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp index 566a4a1534..78376f935a 100644 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ b/src/runtime/CL/functions/CLSobel3x3.cpp @@ -23,14 +23,17 @@ */ #include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel3x3Kernel.h" #include "support/MemorySupport.h" #include using namespace arm_compute; +CLSobel3x3::~CLSobel3x3() = default; + void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(),
input, output_x, output_y, border_mode, constant_border_value); @@ -41,5 +44,5 @@ void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *i auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp index f70e4f36f5..fa5d8945fb 100644 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ b/src/runtime/CL/functions/CLSobel5x5.cpp @@ -24,20 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() + : _memory_group(std::move(memory_manager)), + _sobel_hor(support::cpp14::make_unique<CLSobel5x5HorKernel>()), + _sobel_vert(support::cpp14::make_unique<CLSobel5x5VertKernel>()), + _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), + _tmp_x(), + _tmp_y() { } +CLSobel5x5::~CLSobel5x5() = default; + void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -58,8 +67,8 @@ void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *i _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -67,27 +76,27 @@ void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *i { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); -
_sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel5x5::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); + CLScheduler::get().enqueue(*_sobel_hor, false); + CLScheduler::get().enqueue(*_sobel_vert); } diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp index 792432e841..f462adb0ed 100644 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ b/src/runtime/CL/functions/CLSobel7x7.cpp @@ -24,20 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() + : _memory_group(std::move(memory_manager)), + _sobel_hor(support::cpp14::make_unique<CLSobel7x7HorKernel>()), + _sobel_vert(support::cpp14::make_unique<CLSobel7x7VertKernel>()), + _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), + _tmp_x(), + _tmp_y() { } +CLSobel7x7::~CLSobel7x7() = default; + void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -58,8 +67,8 @@ void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *i _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -67,27 +76,27 @@ void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *i { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode ==
BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel7x7::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); + CLScheduler::get().enqueue(*_sobel_hor, false); + CLScheduler::get().enqueue(*_sobel_vert); } diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index 759c8706a1..4caf91488e 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -24,24 +24,38 @@ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "src/core/helpers/SoftmaxHelpers.h" +#include "support/MemorySupport.h" namespace arm_compute { template <bool IS_LOG> CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _input_permuted(), _output_permuted(), + : _memory_group(std::move(memory_manager)), + _permute_input(), + _permute_output(), + _max_shift_exp_sum_kernel(support::cpp14::make_unique<CLLogits1DMaxShiftExpSumKernel>()), + _norm_kernel(support::cpp14::make_unique<CLLogits1DNormKernel>()), + _max(), + _sum(), + _tmp(), + _input_permuted(), + _output_permuted(), _needs_permute() { } +template <bool IS_LOG> +CLSoftmaxLayerGeneric<IS_LOG>::~CLSoftmaxLayerGeneric() = default; + template <bool IS_LOG> void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { @@ -78,7 +92,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co _sum.allocator()->init(tmp_input->info()->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type)); // Set GPU target to kernels -
_max_shift_exp_sum_kernel.set_target(CLScheduler::get().target()); + _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target()); // Manage intermediate buffers _memory_group.manage(&_tmp); @@ -91,8 +105,8 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co softmax_info.input_data_type = tmp_input->info()->data_type(); // Configure kernels - _max_shift_exp_sum_kernel.configure(compile_context, tmp_input, &_max, &_tmp, &_sum, softmax_info); - _norm_kernel.configure(compile_context, &_tmp, &_sum, tmp_output, softmax_info); + _max_shift_exp_sum_kernel->configure(compile_context, tmp_input, &_max, &_tmp, &_sum, softmax_info); + _norm_kernel->configure(compile_context, &_tmp, &_sum, tmp_output, softmax_info); // Allocate intermediate buffers _tmp.allocator()->allocate(); @@ -156,8 +170,8 @@ void CLSoftmaxLayerGeneric<IS_LOG>::run() _permute_input.run(); } - CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false); - CLScheduler::get().enqueue(_norm_kernel, !_needs_permute); + CLScheduler::get().enqueue(*_max_shift_exp_sum_kernel, false); + CLScheduler::get().enqueue(*_norm_kernel, !_needs_permute); if(_needs_permute) { diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index eea3cb535f..e83def5677 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,14 +29,21 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) + : _space_to_batch_kernel(support::cpp14::make_unique<CLSpaceToBatchLayerKernel>()), + _memset_kernel(support::cpp14::make_unique<CLMemsetKernel>()), + _has_padding(false) { } +CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; + void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); @@ -49,9 +56,9 @@ void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, con if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel->configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) @@ -67,9 +74,9 @@ void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, con if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel->configure(compile_context, output, PixelValue(0,
input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); } Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) @@ -94,8 +101,8 @@ void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings if(_has_padding) { - CLScheduler::get().enqueue(_memset_kernel, true); + CLScheduler::get().enqueue(*_memset_kernel, true); } - CLScheduler::get().enqueue(_space_to_batch_kernel, true); + CLScheduler::get().enqueue(*_space_to_batch_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 06aa92d6fa..db8c4953cc 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,18 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel() + : _space_to_depth_kernel(support::cpp14::make_unique<CLSpaceToDepthLayerKernel>()) { } +CLSpaceToDepthLayer::~CLSpaceToDepthLayer() = default; + void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); @@ -44,7 +48,7 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) { - _space_to_depth_kernel.configure(compile_context, input, output, block_shape); + _space_to_depth_kernel->configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -54,6 +58,6 @@ void CLSpaceToDepthLayer::run() { - CLScheduler::get().enqueue(_space_to_depth_kernel, true); + CLScheduler::get().enqueue(*_space_to_depth_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 39f0ab4779..f4aa78a72d 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -42,6 +44,8 @@ CLStackLayer::CLStackLayer() // NOLINT { } +CLStackLayer::~CLStackLayer() = default; + void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); @@ -50,14 +54,15 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { _num_inputs =
input.size(); - _stack_kernels.resize(_num_inputs); + _stack_kernels.reserve(_num_inputs); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); for(unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); + _stack_kernels.emplace_back(support::cpp14::make_unique<CLStackLayerKernel>()); + _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } @@ -87,7 +92,7 @@ void CLStackLayer::run() { for(unsigned i = 0; i < _num_inputs; i++) { - CLScheduler::get().enqueue(_stack_kernels[i], false); + CLScheduler::get().enqueue(*_stack_kernels[i], false); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index b78073dd67..3f6814f5ce 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLStridedSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp index 3d2d1853ca..8282f37e4b 100644 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ b/src/runtime/CL/functions/CLTableLookup.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTableLookup.h" -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" +#include "src/core/CL/kernels/CLTableLookupKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp index bdbf37e841..250f6f034f 100644 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ b/src/runtime/CL/functions/CLThreshold.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLThreshold.h" -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" +#include "src/core/CL/kernels/CLThresholdKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index 68efad0125..8384e48baf 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "arm_compute/core/CL/kernels/CLTileKernel.h" +#include "src/core/CL/kernels/CLTileKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index 8cade66a90..43fa7a012a 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTranspose.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp index e9456c100b..10b4b76a5e 100644 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLUpsampleLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLUpsampleLayer::CLUpsampleLayer() // NOLINT - : _upsample(), + : _upsample(support::cpp14::make_unique()), _output(nullptr) { } +CLUpsampleLayer::~CLUpsampleLayer() = default; + Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy) { @@ -53,11 +57,11 @@ void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTens ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _upsample.configure(compile_context, input, _output, info, upsampling_policy); + _upsample->configure(compile_context, input, _output, info, upsampling_policy); } void CLUpsampleLayer::run() { - CLScheduler::get().enqueue(_upsample, false); + CLScheduler::get().enqueue(*_upsample, false); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp index fffc58c8d0..86e5a7bd86 100644 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ b/src/runtime/CL/functions/CLWarpAffine.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLWarpAffine.h" -#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWarpAffineKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp index 2b4b187e38..7e8bc5cdff 100644 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ b/src/runtime/CL/functions/CLWarpPerspective.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTe auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 7ad017f918..7af42904e8 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -28,6 +28,15 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include 
"src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -90,11 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz } // namespace CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), - _is_prepared(false) + : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(support::cpp14::make_unique()), + _output_transform(support::cpp14::make_unique()), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), _is_prepared(false) { } +CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; + void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) { @@ -139,7 +150,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _input_transform.configure(compile_context, input, &_input0, winograd_info); // Configure filter transform - _filter_transform.configure(compile_context, weights, &_input1, winograd_info); + _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, @@ -147,7 +158,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte (input->info()->data_type() == DataType::F16))); // Configure output transform - _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); + _output_transform->configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); // Allocate temporary tensors _input0.allocator()->allocate(); @@ -218,7 +229,7 @@ void CLWinogradConvolutionLayer::run() _batched_mm.run(); // Run output transform - CLScheduler::get().enqueue(_output_transform); + CLScheduler::get().enqueue(*_output_transform); } void CLWinogradConvolutionLayer::prepare() @@ -227,7 +238,7 @@ void CLWinogradConvolutionLayer::prepare() { // Run filter transform and mark original weights as unused _input1.allocator()->allocate(); - CLScheduler::get().enqueue(_filter_transform, false); + CLScheduler::get().enqueue(*_filter_transform, false); _original_weights->mark_as_unused(); // Prepare GEMM and release reshaped weights if marked unused by CLGEMM diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp index 9498206549..308c41f714 100644 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp @@ -24,8 
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 9498206549..308c41f714 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
 #include "arm_compute/core/Error.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
 #include "support/MemorySupport.h"
 
 using namespace arm_compute;
@@ -40,7 +41,7 @@ void CLWinogradInputTransform::configure(const CLCompileContext &compile_context
     auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
     k->configure(compile_context, input, output, winograd_info);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+    _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp
index d553f97009..46bf220b0c 100644
--- a/src/runtime/CL/functions/CLYOLOLayer.cpp
+++ b/src/runtime/CL/functions/CLYOLOLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/CL/kernels/CLYOLOLayerKernel.h"
 #include "support/MemorySupport.h"
 
 using namespace arm_compute;
-- 
cgit v1.2.1