From 658039bc4e06be34272eccf559a516a6b52f75f5 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 15 Sep 2017 16:30:50 +0100 Subject: COMPMID-534: Add MemoryManager support in NEON functions Adds support for: -NECannyEdge -NEConvolution -NEDirectConvolution -NEGEMM -NEGEMMLowp -NEGaussian5x5 -NEHOGDescriptor -NEHOGGradient -NEL2Normalize -NELocallyConnectedLayer -NENormalizationLayer -NEScale -NESobel5x5 -NESobel7x7 Change-Id: I68e05aa6054372fa873a882633a15fb97882c00d Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87926 Reviewed-by: Pablo Tello Tested-by: Kaizen --- arm_compute/runtime/NEON/functions/NECannyEdge.h | 5 ++- arm_compute/runtime/NEON/functions/NEConvolution.h | 6 +++- .../NEON/functions/NEDirectConvolutionLayer.h | 7 +++- arm_compute/runtime/NEON/functions/NEGEMM.h | 7 +++- arm_compute/runtime/NEON/functions/NEGEMMLowp.h | 7 +++- arm_compute/runtime/NEON/functions/NEGaussian5x5.h | 6 +++- .../runtime/NEON/functions/NEHOGDescriptor.h | 7 +++- arm_compute/runtime/NEON/functions/NEHOGGradient.h | 5 ++- .../runtime/NEON/functions/NEHOGMultiDetection.h | 7 +++- arm_compute/runtime/NEON/functions/NEL2Normalize.h | 7 +++- .../NEON/functions/NELocallyConnectedLayer.h | 7 +++- .../runtime/NEON/functions/NENormalizationLayer.h | 8 +++-- arm_compute/runtime/NEON/functions/NEOpticalFlow.h | 5 ++- arm_compute/runtime/NEON/functions/NEScale.h | 23 +++++++++---- arm_compute/runtime/NEON/functions/NESobel5x5.h | 6 +++- arm_compute/runtime/NEON/functions/NESobel7x7.h | 6 +++- scripts/clang_tidy_rules.py | 1 + src/runtime/NEON/functions/NECannyEdge.cpp | 32 ++++++++++++++---- src/runtime/NEON/functions/NEConvolution.cpp | 12 +++++-- .../NEON/functions/NEDirectConvolutionLayer.cpp | 11 +++++-- src/runtime/NEON/functions/NEGEMM.cpp | 12 +++++-- src/runtime/NEON/functions/NEGEMMLowp.cpp | 12 +++++-- src/runtime/NEON/functions/NEGaussian5x5.cpp | 11 +++++-- src/runtime/NEON/functions/NEHOGDescriptor.cpp | 15 +++++++-- src/runtime/NEON/functions/NEHOGGradient.cpp | 
13 ++++++-- src/runtime/NEON/functions/NEHOGMultiDetection.cpp | 37 +++++++++++++++------ src/runtime/NEON/functions/NEL2Normalize.cpp | 11 +++++-- .../NEON/functions/NELocallyConnectedLayer.cpp | 13 ++++++-- .../NEON/functions/NENormalizationLayer.cpp | 11 +++++-- src/runtime/NEON/functions/NEOpticalFlow.cpp | 13 ++++++-- src/runtime/NEON/functions/NEScale.cpp | 38 +++++++++++++++------- src/runtime/NEON/functions/NESobel5x5.cpp | 13 ++++++-- src/runtime/NEON/functions/NESobel7x7.cpp | 13 ++++++-- 33 files changed, 312 insertions(+), 75 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h index fbf2d90740..b7e0ffbcf1 100644 --- a/arm_compute/runtime/NEON/functions/NECannyEdge.h +++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h @@ -28,6 +28,8 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" #include @@ -55,7 +57,7 @@ public: * * Initialize Sobel kernel to nullptr. 
*/ - NECannyEdge(); + NECannyEdge(std::shared_ptr memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NECannyEdge(const NECannyEdge &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -80,6 +82,7 @@ public: void run() override; private: + MemoryGroup _memory_group; /**< Function's memory group */ std::unique_ptr _sobel; /**< Pointer to Sobel kernel */ std::unique_ptr _gradient; /**< Gradient kernel */ NEEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel */ diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h index 1704d9fa94..9c0a906651 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolution.h +++ b/arm_compute/runtime/NEON/functions/NEConvolution.h @@ -28,10 +28,13 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/INESimpleFunction.h" #include "arm_compute/runtime/Tensor.h" #include +#include namespace arm_compute { @@ -70,7 +73,7 @@ class NEConvolutionSquare : public IFunction { public: /** Default constructor */ - NEConvolutionSquare(); + NEConvolutionSquare(std::shared_ptr memory_manager = nullptr); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source tensor. Data type supported: U8. 
(Written to only for @p border_mode != UNDEFINED) @@ -86,6 +89,7 @@ public: void run() override; private: + MemoryGroup _memory_group; /**< Function memory group */ Tensor _tmp; /**< temporary buffer for output of horizontal pass */ bool _is_separable; /**< true if the convolution can be separated */ NESeparableConvolutionHorKernel _kernel_hor; /**< kernel for horizontal pass of separated convolution */ diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index 9d2775ada6..daaf18f297 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -29,8 +29,12 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { /** Function to run the direct convolution. @@ -45,7 +49,7 @@ class NEDirectConvolutionLayer : public IFunction { public: /** Constructor */ - NEDirectConvolutionLayer(); + NEDirectConvolutionLayer(std::shared_ptr memory_manager = nullptr); /** Set the input, weights, biases and output tensors. 
* * @note: DirectConvolution only works in the following configurations: @@ -69,6 +73,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel; NEDirectConvolutionLayerKernel _conv_kernel; NEFillBorderKernel _input_border_handler; diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 3c8d7cf9b7..b4b9e8be01 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -30,8 +30,12 @@ #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { /** Basic function to execute GEMM on NEON. This function calls the following NEON kernels: @@ -46,7 +50,7 @@ class NEGEMM : public IFunction { public: /** Constructor */ - NEGEMM(); + NEGEMM(std::shared_ptr memory_manager = nullptr); /** Initialise the kernel's inputs, output * * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. 
@@ -65,6 +69,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEGEMMInterleave4x4Kernel _interleave_kernel; NEGEMMTranspose1xWKernel _transpose_kernel; NEGEMMMatrixMultiplyKernel _mm_kernel; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h index bfb1a494b8..0b0a7742f6 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h @@ -32,6 +32,10 @@ #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +#include namespace arm_compute { @@ -48,7 +52,7 @@ class NEGEMMLowp : public IFunction { public: /** Constructor */ - NEGEMMLowp(); + NEGEMMLowp(std::shared_ptr memory_manager = nullptr); /** Initialise the kernel's inputs, output * * @note GEMM_LOWP: low precision GEMM kernel @@ -75,6 +79,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEGEMMInterleave4x4Kernel _interleave_kernel; NEGEMMTranspose1xWKernel _transpose_kernel; NEGEMMLowpMatrixMultiplyKernel _mm_kernel; diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h index 699e42efb4..2aae3cb513 100644 --- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h +++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h @@ -28,9 +28,12 @@ #include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" #include +#include namespace arm_compute { @@ -48,7 +51,7 @@ class NEGaussian5x5 : public IFunction public: /** Default constructor */ - 
NEGaussian5x5(); + NEGaussian5x5(std::shared_ptr memory_manager = nullptr); /** Initialise the function's input, output and border mode. * * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -62,6 +65,7 @@ public: void run() override; protected: + MemoryGroup _memory_group; /**< Function memory group */ NEGaussian5x5HorKernel _kernel_hor; /**< kernel for horizontal pass */ NEGaussian5x5VertKernel _kernel_vert; /**< kernel for vertical pass */ Tensor _tmp; /**< temporary buffer for output of horizontal pass */ diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h index b7b4909060..30989568e1 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h +++ b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h @@ -26,9 +26,13 @@ #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { class IHOG; @@ -43,7 +47,7 @@ class NEHOGDescriptor : public IFunction { public: /** Default constructor */ - NEHOGDescriptor(); + NEHOGDescriptor(std::shared_ptr memory_manager = nullptr); /** Initialise the function's source, destination, HOG data-object and border mode * * @param[in, out] input Input tensor. 
Data type supported: U8 @@ -59,6 +63,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEHOGGradient _gradient; NEHOGOrientationBinningKernel _orient_bin; NEHOGBlockNormalizationKernel _block_norm; diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h index dd2d99adfe..7e268411e1 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGGradient.h +++ b/arm_compute/runtime/NEON/functions/NEHOGGradient.h @@ -27,6 +27,8 @@ #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEDerivative.h" #include "arm_compute/runtime/Tensor.h" @@ -46,7 +48,7 @@ class NEHOGGradient : public IFunction { public: /** Default constructor */ - NEHOGGradient(); + NEHOGGradient(std::shared_ptr memory_manager = nullptr); /** Initialise the function's source, destinations, phase type and border mode * * @param[in, out] input Input tensor. Data type supported: U8. 
@@ -63,6 +65,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEDerivative _derivative; std::unique_ptr _mag_phase; Tensor _gx; diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h index 2d07e6435f..0d268ca565 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h +++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h @@ -29,10 +29,14 @@ #include "arm_compute/core/IMultiHOG.h" #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h" #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { /** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. 
This function calls the following NEON kernels: @@ -53,7 +57,7 @@ class NEHOGMultiDetection : public IFunction { public: /** Default constructor */ - NEHOGMultiDetection(); + NEHOGMultiDetection(std::shared_ptr memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEHOGMultiDetection(const NEHOGMultiDetection &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -85,6 +89,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEHOGGradient _gradient_kernel; std::unique_ptr _orient_bin_kernel; std::unique_ptr _block_norm_kernel; diff --git a/arm_compute/runtime/NEON/functions/NEL2Normalize.h b/arm_compute/runtime/NEON/functions/NEL2Normalize.h index 1297b99e79..95d5186c13 100644 --- a/arm_compute/runtime/NEON/functions/NEL2Normalize.h +++ b/arm_compute/runtime/NEON/functions/NEL2Normalize.h @@ -26,9 +26,13 @@ #include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { class ITensor; @@ -43,7 +47,7 @@ class NEL2Normalize : public IFunction { public: /** Constructor */ - NEL2Normalize(); + NEL2Normalize(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * * @param[in, out] input Source tensor. Data types supported: F32. 
(Written to only for border_size != 0) @@ -57,6 +61,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEReductionOperation _reduce_func; NEL2NormalizeKernel _normalize_kernel; Tensor _sumsq; diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h index efb2ff6c8b..18d2a1dfb5 100644 --- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h @@ -31,8 +31,12 @@ #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { class INETensor; @@ -48,7 +52,7 @@ class NELocallyConnectedLayer : public IFunction { public: /** Default constructor */ - NELocallyConnectedLayer(); + NELocallyConnectedLayer(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], @@ -66,6 +70,7 @@ public: void run() override; private: + MemoryGroup _memory_group; NEIm2ColKernel _input_im2col_kernel; NEWeightsReshapeKernel _weights_reshape_kernel; NELocallyConnectedMatrixMultiplyKernel _mm_kernel; diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index 4cfea226f3..1c95c5bc4a 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -29,9 +29,12 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" -#include "arm_compute/core/Types.h" +#include namespace arm_compute { @@ -48,7 +51,7 @@ class NENormalizationLayer : public IFunction { public: /** Default constructor */ - NENormalizationLayer(); + NENormalizationLayer(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * * @param[in] input Source tensor. 
3 lower dims represent a single input with dimensions [width, height, IFM], @@ -62,6 +65,7 @@ public: void run() override; private: + MemoryGroup _memory_group; /**< Function memory group */ NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */ NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */ NEFillBorderKernel _border_handler; /**< Kernel to handle borders */ diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h index 0534551d19..5d1fbe3a22 100644 --- a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h +++ b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/Array.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" #include "arm_compute/runtime/Tensor.h" @@ -51,7 +53,7 @@ class NEOpticalFlow : public IFunction { public: /** Constructor */ - NEOpticalFlow(); + NEOpticalFlow(std::shared_ptr memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEOpticalFlow(const NEOpticalFlow &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -80,6 +82,7 @@ public: void run() override; private: + MemoryGroup _memory_group; std::unique_ptr _func_scharr; std::unique_ptr _kernel_tracker; std::unique_ptr _scharr_gx; diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h index e1da891dcf..91cda066e7 100644 --- a/arm_compute/runtime/NEON/functions/NEScale.h +++ b/arm_compute/runtime/NEON/functions/NEScale.h @@ -24,25 +24,30 @@ #ifndef __ARM_COMPUTE_NESCALEIMAGE_H__ #define __ARM_COMPUTE_NESCALEIMAGE_H__ +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" 
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" #include <cstdint> +#include <memory> namespace arm_compute { class ITensor; /** Basic function to run @ref NEScaleKernel */ -class NEScale : public INESimpleFunction +class NEScale : public IFunction { public: /** Constructor * * Initialize NEScale */ - NEScale(); + NEScale(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Initialize the function's source, destination, interpolation type and border_mode. * * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -53,10 +58,16 @@ public: */ void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); + // Inherited methods overridden: + void run() override; + private: - Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - Tensor _dx; /**< Element's distance between the X real coordinate and the smallest X following integer */ - Tensor _dy; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + MemoryGroup _memory_group; /**< Function memory group */ + Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + Tensor _dx; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor _dy; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + NEScaleKernel _scale_kernel; /**< Kernel to perform the scaling */ + NEFillBorderKernel _border_handler; /**< kernel to handle 
tensor borders */ }; } #endif /*__ARM_COMPUTE_NESCALEIMAGE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h index fc4d665a70..2b7cb70f15 100644 --- a/arm_compute/runtime/NEON/functions/NESobel5x5.h +++ b/arm_compute/runtime/NEON/functions/NESobel5x5.h @@ -28,9 +28,12 @@ #include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" #include <cstdint> +#include <memory> namespace arm_compute { @@ -47,7 +50,7 @@ class NESobel5x5 : public IFunction { public: /** Default constructor */ - NESobel5x5(); + NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. @@ -65,6 +68,7 @@ public: void run() override; protected: + MemoryGroup _memory_group; /**< Function memory group */ NESobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */ NESobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */ Tensor _tmp_x; /**< Temporary buffer for Sobel X */ diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h index 06b7c80ad6..5f7bab7cfd 100644 --- a/arm_compute/runtime/NEON/functions/NESobel7x7.h +++ b/arm_compute/runtime/NEON/functions/NESobel7x7.h @@ -28,9 +28,12 @@ #include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" #include <cstdint> +#include <memory> namespace arm_compute { @@ -47,7 +50,7 @@ class NESobel7x7 : public IFunction { public: /** Default constructor */ - NESobel7x7(); + NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager 
= nullptr); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. @@ -65,6 +68,7 @@ public: void run() override; protected: + MemoryGroup _memory_group; /**< Function memory group */ NESobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */ NESobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */ Tensor _tmp_x; /**< Temporary buffer for Sobel X */ diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 61ada49149..e4daceb071 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -77,6 +77,7 @@ def filter_clang_tidy_lines( lines ): ("NEPoolingLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or ("NESoftmaxLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or ("parameter 'memory_manager' is unused" in line) or + ("parameter 'memory_manager' is copied for each invocation but only used as a const reference" in line) or "3rdparty" in line): print_context=False continue diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp index 318cea2342..9be1df6ea4 100644 --- a/src/runtime/NEON/functions/NECannyEdge.cpp +++ b/src/runtime/NEON/functions/NECannyEdge.cpp @@ -41,8 +41,9 @@ using namespace arm_compute; -NECannyEdge::NECannyEdge() // NOLINT - : _sobel(), +NECannyEdge::NECannyEdge(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), @@ -93,6 +94,10 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, _phase.allocator()->init(info); _nonmax.allocator()->init(info); + // Manage intermediate buffers + _memory_group.manage(&_gx); + _memory_group.manage(&_gy); + // Configure/Init sobelNxN if(gradient_size == 3) { @@ -117,6 +122,10 @@ void NECannyEdge::configure(ITensor *input, 
ITensor *output, int32_t upper_thr, ARM_COMPUTE_ERROR("Gradient size not supported\n"); } + // Manage intermediate buffers + _memory_group.manage(&_magnitude); + _memory_group.manage(&_phase); + // Configure gradient if(use_fp16) { @@ -131,6 +140,13 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, _gradient = std::move(k); } + // Allocate intermediate tensors + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); + + // Manage intermediate buffers + _memory_group.manage(&_nonmax); + // Configure non-maxima suppression _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED); @@ -138,6 +154,10 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, // it. If border mode is undefined filling the border is a nop. _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value); + // Allocate intermediate tensors + _phase.allocator()->allocate(); + _magnitude.allocator()->allocate(); + // Configure edge tracing _edge_trace.configure(&_nonmax, output); @@ -145,10 +165,6 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0); // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - _phase.allocator()->allocate(); - _magnitude.allocator()->allocate(); _nonmax.allocator()->allocate(); } @@ -157,6 +173,8 @@ void NECannyEdge::run() ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); ARM_COMPUTE_ERROR_ON(_output == nullptr); + _memory_group.acquire(); + // Run sobelNxN _sobel->run(); @@ -177,4 +195,6 @@ void NECannyEdge::run() // Run edge tracing NEScheduler::get().schedule(&_edge_trace, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEConvolution.cpp 
b/src/runtime/NEON/functions/NEConvolution.cpp index 249274ba32..f10ffa6d14 100644 --- a/src/runtime/NEON/functions/NEConvolution.cpp +++ b/src/runtime/NEON/functions/NEConvolution.cpp @@ -48,8 +48,8 @@ void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t } template -NEConvolutionSquare::NEConvolutionSquare() - : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() +NEConvolutionSquare::NEConvolutionSquare(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() { } @@ -72,6 +72,10 @@ void NEConvolutionSquare::configure(ITensor *input, ITensor *output _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type)); + // Manage intermediate buffers + _memory_group.manage(&_tmp); + + // Calculate scale if(scale == 0) { scale = calculate_matrix_scale(conv, matrix_size); @@ -98,8 +102,12 @@ void NEConvolutionSquare::run() if(_is_separable) { + _memory_group.acquire(); + NEScheduler::get().schedule(&_kernel_hor, Window::DimY); NEScheduler::get().schedule(&_kernel_vert, Window::DimY); + + _memory_group.release(); } else { diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index 810efe539f..a56a73c44a 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -33,8 +33,8 @@ using namespace arm_compute; -NEDirectConvolutionLayer::NEDirectConvolutionLayer() - : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator() +NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator() { } @@ -46,6 +46,9 @@ void NEDirectConvolutionLayer::configure(ITensor *input, 
const ITensor *weights, _accumulator.allocator()->free(); } + // Manage intermediate buffers + _memory_group.manage(&_accumulator); + // Allocate the intermediate accumulator tensor in case of fixed point input switch(output->info()->data_type()) { @@ -87,6 +90,10 @@ void NEDirectConvolutionLayer::run() { NEScheduler::get().schedule(&_input_border_handler, Window::DimZ); + _memory_group.acquire(); + NEScheduler::get().schedule(&_conv_kernel, Window::DimZ); NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index dfcb3954ea..85b283cd41 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -36,8 +36,8 @@ using namespace arm_compute; -NEGEMM::NEGEMM() - : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false) +NEGEMM::NEGEMM(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false) { } @@ -85,6 +85,10 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe _tmp_a.allocator()->init(info_a); _tmp_b.allocator()->init(info_b); + // Manage intermediate buffers + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + // Configure interleave kernel _interleave_kernel.configure(a, &_tmp_a); @@ -109,6 +113,8 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe void NEGEMM::run() { + _memory_group.acquire(); + if(!_run_vector_matrix_multiplication) { // Run interleave kernel @@ -121,6 +127,8 @@ void NEGEMM::run() // Run matrix multiply kernel NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? 
Window::DimX : Window::DimY); + _memory_group.release(); + // Run matrix addition kernel if(_run_addition) { diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp index b64f769459..7413b28d03 100644 --- a/src/runtime/NEON/functions/NEGEMMLowp.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp @@ -34,8 +34,8 @@ using namespace arm_compute; -NEGEMMLowp::NEGEMMLowp() - : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b() +NEGEMMLowp::NEGEMMLowp(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b() { } @@ -63,6 +63,10 @@ void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, _tmp_a.allocator()->init(info_a); _tmp_b.allocator()->init(info_b); + // Manage intermediate buffers + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + _interleave_kernel.configure(a, &_tmp_a); _transpose_kernel.configure(b, &_tmp_b); _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift); @@ -73,6 +77,8 @@ void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, void NEGEMMLowp::run() { + _memory_group.acquire(); + /* Run interleave kernel */ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); @@ -81,4 +87,6 @@ void NEGEMMLowp::run() /* Run matrix multiply kernel */ NEScheduler::get().schedule(&_mm_kernel, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp index a1ce985633..f085975b1e 100644 --- a/src/runtime/NEON/functions/NEGaussian5x5.cpp +++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp @@ -32,8 +32,8 @@ using namespace arm_compute; -NEGaussian5x5::NEGaussian5x5() - : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler() +NEGaussian5x5::NEGaussian5x5(std::shared_ptr memory_manager) + : 
_memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler() { } @@ -43,6 +43,9 @@ void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); _tmp.allocator()->init(tensor_info); + // Manage intermediate buffers + _memory_group.manage(&_tmp); + // Create and configure kernels for the two passes _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED); _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED); @@ -54,7 +57,11 @@ void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border void NEGaussian5x5::run() { + _memory_group.acquire(); + NEScheduler::get().schedule(&_border_handler, Window::DimZ); NEScheduler::get().schedule(&_kernel_hor, Window::DimY); NEScheduler::get().schedule(&_kernel_vert, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp index a592f53d44..5e98269f47 100644 --- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp +++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp @@ -31,8 +31,8 @@ using namespace arm_compute; -NEHOGDescriptor::NEHOGDescriptor() - : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() +NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() { } @@ -71,9 +71,16 @@ void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); _hog_space.allocator()->init(info_space); + // Manage intermediate buffers + _memory_group.manage(&_mag); + _memory_group.manage(&_phase); + // Initialise gradient kernel _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); + // Manage 
intermediate buffers + _memory_group.manage(&_hog_space); + // Initialise orientation binning kernel _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info()); @@ -88,6 +95,8 @@ void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog void NEHOGDescriptor::run() { + _memory_group.acquire(); + // Run gradient _gradient.run(); @@ -96,4 +105,6 @@ void NEHOGDescriptor::run() // Run block normalization kernel NEScheduler::get().schedule(&_block_norm, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp index 3e2640d631..efc8690ede 100644 --- a/src/runtime/NEON/functions/NEHOGGradient.cpp +++ b/src/runtime/NEON/functions/NEHOGGradient.cpp @@ -30,8 +30,9 @@ using namespace arm_compute; -NEHOGGradient::NEHOGGradient() // NOLINT - : _derivative(), +NEHOGGradient::NEHOGGradient(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _derivative(), _mag_phase(nullptr), _gx(), _gy() @@ -51,6 +52,10 @@ void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor _gx.allocator()->init(info); _gy.allocator()->init(info); + // Manage intermediate buffers + _memory_group.manage(&_gx); + _memory_group.manage(&_gy); + // Initialise derivate kernel _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); @@ -75,9 +80,13 @@ void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor void NEHOGGradient::run() { + _memory_group.acquire(); + // Run derivative _derivative.run(); // Run magnitude/phase kernel NEScheduler::get().schedule(_mag_phase.get(), Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp index 1a038a2f62..8c834e2a93 100644 --- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp +++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp @@ -32,8 
+32,9 @@ using namespace arm_compute; -NEHOGMultiDetection::NEHOGMultiDetection() // NOLINT - : _gradient_kernel(), +NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), @@ -139,6 +140,10 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, TensorInfo info_phase(shape_img, Format::U8); _phase.allocator()->init(info_phase); + // Manage intermediate buffers + _memory_group.manage(&_mag); + _memory_group.manage(&_phase); + // Initialise gradient kernel _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); @@ -164,10 +169,17 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); _hog_space[i].allocator()->init(info_space); + // Manage intermediate buffers + _memory_group.manage(_hog_space.get() + i); + // Initialise orientation binning kernel _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info()); } + // Allocate intermediate tensors + _mag.allocator()->allocate(); + _phase.allocator()->allocate(); + // Configure NETensor for the normalized HOG space and block normalization kernel for(size_t i = 0; i < _num_block_norm_kernel; ++i) { @@ -178,10 +190,19 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); _hog_norm_space[i].allocator()->init(tensor_info); + // Manage intermediate buffers + _memory_group.manage(_hog_norm_space.get() + i); + // Initialize block normalization kernel _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info()); } + // Allocate intermediate tensors + for(size_t i = 0; i < 
_num_orient_bin_kernel; ++i) + { + _hog_space[i].allocator()->allocate(); + } + // Configure HOG detector kernel for(size_t i = 0; i < _num_hog_detect_kernel; ++i) { @@ -194,14 +215,6 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, _non_maxima_kernel->configure(_detection_windows, min_distance); // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - _hog_space[i].allocator()->allocate(); - } - for(size_t i = 0; i < _num_block_norm_kernel; ++i) { _hog_norm_space[i].allocator()->allocate(); @@ -212,6 +225,8 @@ void NEHOGMultiDetection::run() { ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); + _memory_group.acquire(); + // Reset detection window _detection_windows->clear(); @@ -241,4 +256,6 @@ void NEHOGMultiDetection::run() { NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY); } + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp index 378d78e3f3..349a781b0b 100644 --- a/src/runtime/NEON/functions/NEL2Normalize.cpp +++ b/src/runtime/NEON/functions/NEL2Normalize.cpp @@ -28,13 +28,16 @@ using namespace arm_compute; -NEL2Normalize::NEL2Normalize() - : _reduce_func(), _normalize_kernel(), _sumsq() +NEL2Normalize::NEL2Normalize(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() { } void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon) { + // Manage intermediate buffers + _memory_group.manage(&_sumsq); + // Configure Kernels _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE); _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); @@ -45,6 +48,10 @@ void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis void 
NEL2Normalize::run() { + _memory_group.acquire(); + _reduce_func.run(); NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp index e7c71e04d1..cb48598921 100644 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp @@ -33,8 +33,9 @@ using namespace arm_compute; -NELocallyConnectedLayer::NELocallyConnectedLayer() - : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false) +NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), + _is_first_run(false) { } @@ -102,6 +103,10 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei shape_gemm.set(1, mat_input_rows); _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); + // Manage intermediate buffers + _memory_group.manage(&_input_im2col_reshaped); + _memory_group.manage(&_gemm_output); + // Configure kernels _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); @@ -123,6 +128,8 @@ void NELocallyConnectedLayer::run() NEScheduler::get().schedule(&_weights_reshape_kernel, 3); } + _memory_group.acquire(); + // Run input reshaping NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY); @@ -131,4 +138,6 @@ void NELocallyConnectedLayer::run() // Reshape output matrix NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY); + + 
_memory_group.release(); } diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index 69ff32591f..e01ef6660d 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -32,8 +32,8 @@ using namespace arm_compute; -NENormalizationLayer::NENormalizationLayer() - : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared() +NENormalizationLayer::NENormalizationLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared() { } @@ -44,6 +44,9 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, Norm TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); _input_squared.allocator()->init(tensor_info); + // Manage intermediate buffers + _memory_group.manage(&_input_squared); + // Configure kernels _norm_kernel.configure(input, &_input_squared, output, norm_info); _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); @@ -55,7 +58,11 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, Norm void NENormalizationLayer::run() { + _memory_group.acquire(); + NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); NEScheduler::get().schedule(&_border_handler, Window::DimY); NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp index 3e69a33897..e90d8f6270 100644 --- a/src/runtime/NEON/functions/NEOpticalFlow.cpp +++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp @@ -37,8 +37,9 @@ using namespace arm_compute; -NEOpticalFlow::NEOpticalFlow() // NOLINT - : _func_scharr(), +NEOpticalFlow::NEOpticalFlow(std::shared_ptr 
memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), @@ -97,6 +98,10 @@ void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyr _scharr_gx[i].allocator()->init(tensor_info); _scharr_gy[i].allocator()->init(tensor_info); + // Manage intermediate buffers + _memory_group.manage(_scharr_gx.get() + i); + _memory_group.manage(_scharr_gy.get() + i); + // Init Scharr kernel _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value); @@ -116,6 +121,8 @@ void NEOpticalFlow::run() { ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); + _memory_group.acquire(); + for(unsigned int level = _num_levels; level > 0; --level) { // Run Scharr kernel @@ -124,4 +131,6 @@ void NEOpticalFlow::run() // Run Lucas-Kanade kernel NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX); } + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 7fc352ab1f..6c5ac3c45b 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Window.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" #include "support/ToolchainSupport.h" @@ -86,10 +86,13 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float } } // namespace -NEScale::NEScale() // NOLINT - : _offsets(), +NEScale::NEScale(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _offsets(), _dx(), - _dy() + _dy(), + 
_scale_kernel(), + _border_handler() { } @@ -119,8 +122,6 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol policy = InterpolationPolicy::NEAREST_NEIGHBOR; } - auto k = arm_compute::support::cpp14::make_unique(); - // Check if the border mode is UNDEFINED const bool border_undefined = border_mode == BorderMode::UNDEFINED; @@ -130,8 +131,9 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol { TensorInfo tensor_info_offsets(shape, Format::S32); _offsets.allocator()->init(tensor_info_offsets); + _memory_group.manage(&_offsets); - k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined); + _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined); // Allocate once the configure methods have been called _offsets.allocator()->allocate(); @@ -149,7 +151,12 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol _dx.allocator()->init(tensor_info_dxdy); _dy.allocator()->init(tensor_info_dxdy); - k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined); + // Manage intermediate buffers + _memory_group.manage(&_offsets); + _memory_group.manage(&_dx); + _memory_group.manage(&_dy); + + _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined); // Allocate once the configure methods have been called _offsets.allocator()->allocate(); @@ -162,13 +169,22 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol } case InterpolationPolicy::AREA: { - k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined); + _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined); break; } default: ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + 
_border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value)); +} + +void NEScale::run() +{ + _memory_group.acquire(); + + NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(&_scale_kernel, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp index 305d21122e..d8f4eda2ff 100644 --- a/src/runtime/NEON/functions/NESobel5x5.cpp +++ b/src/runtime/NEON/functions/NESobel5x5.cpp @@ -32,8 +32,8 @@ using namespace arm_compute; -NESobel5x5::NESobel5x5() - : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() +NESobel5x5::NESobel5x5(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() { } @@ -50,6 +50,8 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, { _tmp_x.allocator()->init(tensor_info); _tmp_y.allocator()->init(tensor_info); + _memory_group.manage(&_tmp_x); + _memory_group.manage(&_tmp_y); _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); @@ -58,6 +60,7 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, else if(run_sobel_x) { _tmp_x.allocator()->init(tensor_info); + _memory_group.manage(&_tmp_x); _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); @@ -65,6 +68,7 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); + _memory_group.manage(&_tmp_y); _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == 
BorderMode::UNDEFINED); _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); @@ -76,6 +80,11 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, void NESobel5x5::run() { NEScheduler::get().schedule(&_border_handler, Window::DimZ); + + _memory_group.acquire(); + NEScheduler::get().schedule(&_sobel_hor, Window::DimY); NEScheduler::get().schedule(&_sobel_vert, Window::DimY); + + _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp index 57fe028567..5b6f60b338 100644 --- a/src/runtime/NEON/functions/NESobel7x7.cpp +++ b/src/runtime/NEON/functions/NESobel7x7.cpp @@ -32,8 +32,8 @@ using namespace arm_compute; -NESobel7x7::NESobel7x7() - : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() +NESobel7x7::NESobel7x7(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() { } @@ -50,6 +50,8 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, { _tmp_x.allocator()->init(tensor_info); _tmp_y.allocator()->init(tensor_info); + _memory_group.manage(&_tmp_x); + _memory_group.manage(&_tmp_y); _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); @@ -58,6 +60,7 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, else if(run_sobel_x) { _tmp_x.allocator()->init(tensor_info); + _memory_group.manage(&_tmp_x); _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); @@ -65,6 +68,7 @@ void NESobel7x7::configure(ITensor *input, 
ITensor *output_x, ITensor *output_y, else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); + _memory_group.manage(&_tmp_y); _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); @@ -76,6 +80,11 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, void NESobel7x7::run() { NEScheduler::get().schedule(&_border_handler, Window::DimZ); + + _memory_group.acquire(); + NEScheduler::get().schedule(&_sobel_hor, Window::DimY); NEScheduler::get().schedule(&_sobel_vert, Window::DimY); + + _memory_group.release(); } -- cgit v1.2.1