From 1562be3e8a449360a90af75f6f1481a30d41be75 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 8 Mar 2018 19:09:19 +0000
Subject: COMPMID-998: Release unused trainable parameters.

Change-Id: I361a520f34080016a25bc86e1e6789777c5152c1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124432
Reviewed-by: Anthony Barbier
Tested-by: Jenkins
---
 arm_compute/core/ITensor.h                         | 10 ++++++++++
 arm_compute/graph2/ITensorHandle.h                 | 21 ++++++++++++++------
 arm_compute/graph2/backends/CL/CLSubTensorHandle.h |  1 +
 arm_compute/graph2/backends/CL/CLTensorHandle.h    |  1 +
 arm_compute/graph2/backends/GLES/GCTensorHandle.h  |  1 +
 .../graph2/backends/NEON/NESubTensorHandle.h       |  1 +
 arm_compute/graph2/backends/NEON/NETensorHandle.h  |  1 +
 arm_compute/graph2/detail/ExecutionHelpers.h       |  5 +++++
 .../CL/functions/CLDepthwiseConvolutionLayer.h     | 10 ++++++++++
 .../runtime/CL/functions/CLFullyConnectedLayer.h   | 11 ++++++++++-
 .../runtime/CL/functions/CLGEMMConvolutionLayer.h  | 10 ++++++++++
 .../runtime/CL/functions/CLLocallyConnectedLayer.h |  9 +++++++++
 .../NEON/functions/NEDepthwiseConvolutionLayer.h   | 10 ++++++++++
 .../runtime/NEON/functions/NEFullyConnectedLayer.h | 11 ++++++++++-
 .../NEON/functions/NEGEMMConvolutionLayer.h        | 11 ++++++++++-
 .../NEON/functions/NELocallyConnectedLayer.h       |  9 +++++++++
 src/core/ITensor.cpp                               | 12 +++++++++++-
 src/graph2/GraphManager.cpp                        |  6 ++++++
 src/graph2/backends/CL/CLSubTensorHandle.cpp       |  5 +++++
 src/graph2/backends/CL/CLTensorHandle.cpp          |  9 +++++++++
 src/graph2/backends/GLES/GCTensorHandle.cpp        |  9 +++++++++
 src/graph2/backends/NEON/NESubTensorHandle.cpp     |  5 +++++
 src/graph2/backends/NEON/NETensorHandle.cpp        |  9 +++++++++
 src/graph2/detail/ExecutionHelpers.cpp             | 11 +++++++++++
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   | 23 +++++++++++++++-------
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |  8 +++++++-
 .../CL/functions/CLGEMMConvolutionLayer.cpp        | 14 ++++++++++----
 .../CL/functions/CLLocallyConnectedLayer.cpp       | 12 +++++++++---
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 21 +++++++++++++++++----
 .../NEON/functions/NEFullyConnectedLayer.cpp       |  8 +++++++-
 .../NEON/functions/NEGEMMConvolutionLayer.cpp      |  8 +++++++-
 .../NEON/functions/NELocallyConnectedLayer.cpp     | 12 +++++++++---
 32 files changed, 260 insertions(+), 34 deletions(-)

diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index 1ef9c6d3f6..255a236861 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -83,6 +83,16 @@ public:
      * @param io_fmt Format information
      */
     void print(std::ostream &s, IOFormatInfo io_fmt = IOFormatInfo()) const;
+    /** Flags if the tensor is used or not
+     *
+     * @return True if it is used else false
+     */
+    bool is_used() const;
+    /** Marks a tensor as unused */
+    void mark_as_unused() const;
+
+private:
+    mutable bool _is_used = { true }; /**< Flag that marks if the tensor is used or not */
 };
 
 using IImage = ITensor;
diff --git a/arm_compute/graph2/ITensorHandle.h b/arm_compute/graph2/ITensorHandle.h
index 68f79d8b01..c92a213121 100644
--- a/arm_compute/graph2/ITensorHandle.h
+++ b/arm_compute/graph2/ITensorHandle.h
@@ -30,25 +30,34 @@ namespace arm_compute
 {
 namespace graph2
 {
-/** Tensor handle interface object **/
+/** Tensor handle interface object */
 class ITensorHandle
 {
 public:
-    /** Default virtual destructor **/
+    /** Default virtual destructor */
     virtual ~ITensorHandle() = default;
-    /** Allocates backend memory for the handle **/
+    /** Allocates backend memory for the handle */
    virtual void allocate() = 0;
-    /** Backend tensor object accessor **/
+    /** Backend tensor object accessor */
     virtual arm_compute::ITensor &tensor() = 0;
-    /** Backend tensor object const accessor **/
+    /** Backend tensor object const accessor */
     virtual const arm_compute::ITensor &tensor() const = 0;
     /** Maps backend tensor object
      *
      * @param[in] blocking Flags if the mapping operations should be blocking
      */
     virtual void map(bool blocking) = 0;
-    /** Un-maps a backend tensor object **/
+    /** Un-maps a backend tensor object */
     virtual void unmap() = 0;
+    /** Releases backend tensor if it is marked as unused
+     *
+     * @note This has no effect on sub-tensors
+     *
+     * @warning Parent tensors don't keep track of their sub-tensors;
+     *          if a parent is marked as unused, all of its sub-tensors are invalidated.
+     *          Marking a sub-tensor as unused, however, does not release the parent tensor.
+     */
+    virtual void release_if_unused() = 0;
     /** Checks if a backing tensor is a sub-tensor object or not
      *
      * @return True if the backend tensor is a sub-tensor else false
diff --git a/arm_compute/graph2/backends/CL/CLSubTensorHandle.h b/arm_compute/graph2/backends/CL/CLSubTensorHandle.h
index 9910980e59..6f3c00c041 100644
--- a/arm_compute/graph2/backends/CL/CLSubTensorHandle.h
+++ b/arm_compute/graph2/backends/CL/CLSubTensorHandle.h
@@ -59,6 +59,7 @@ public:
     const arm_compute::ITensor &tensor() const override;
     void map(bool blocking) override;
     void unmap() override;
+    void release_if_unused() override;
     bool is_subtensor() const override;
 
 private:
diff --git a/arm_compute/graph2/backends/CL/CLTensorHandle.h b/arm_compute/graph2/backends/CL/CLTensorHandle.h
index 37d7147b6b..0b20d1d8fc 100644
--- a/arm_compute/graph2/backends/CL/CLTensorHandle.h
+++ b/arm_compute/graph2/backends/CL/CLTensorHandle.h
@@ -56,6 +56,7 @@ public:
     const arm_compute::ITensor &tensor() const override;
     void map(bool blocking) override;
     void unmap() override;
+    void release_if_unused() override;
     bool is_subtensor() const override;
 
 private:
diff --git a/arm_compute/graph2/backends/GLES/GCTensorHandle.h b/arm_compute/graph2/backends/GLES/GCTensorHandle.h
index 8ead236614..281adee428 100644
--- a/arm_compute/graph2/backends/GLES/GCTensorHandle.h
+++ b/arm_compute/graph2/backends/GLES/GCTensorHandle.h
@@ -56,6 +56,7 @@ public:
     const arm_compute::ITensor &tensor() const override;
     void map(bool blocking) override;
     void unmap() override;
+    void release_if_unused() override;
     bool is_subtensor() const override;
 
 private:
diff --git a/arm_compute/graph2/backends/NEON/NESubTensorHandle.h b/arm_compute/graph2/backends/NEON/NESubTensorHandle.h
index eacdfe0fb4..d62b66f343 100644
--- a/arm_compute/graph2/backends/NEON/NESubTensorHandle.h
+++ b/arm_compute/graph2/backends/NEON/NESubTensorHandle.h
@@ -59,6 +59,7 @@ public:
     const arm_compute::ITensor &tensor() const override;
     void map(bool blocking) override;
     void unmap() override;
+    void release_if_unused() override;
     bool is_subtensor() const override;
 
 private:
diff --git a/arm_compute/graph2/backends/NEON/NETensorHandle.h b/arm_compute/graph2/backends/NEON/NETensorHandle.h
index c22fcdf216..23fd7ccc69 100644
--- a/arm_compute/graph2/backends/NEON/NETensorHandle.h
+++ b/arm_compute/graph2/backends/NEON/NETensorHandle.h
@@ -56,6 +56,7 @@ public:
     const arm_compute::ITensor &tensor() const override;
     void map(bool blocking) override;
     void unmap() override;
+    void release_if_unused() override;
     bool is_subtensor() const override;
 
 private:
diff --git a/arm_compute/graph2/detail/ExecutionHelpers.h b/arm_compute/graph2/detail/ExecutionHelpers.h
index e4523ecf47..bc3cfd5d72 100644
--- a/arm_compute/graph2/detail/ExecutionHelpers.h
+++ b/arm_compute/graph2/detail/ExecutionHelpers.h
@@ -63,6 +63,11 @@ void validate_all_nodes(Graph &g);
  * @return The execution workload
  */
 ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx);
+/** Release the memory of all unused const nodes
+ *
+ * @param[in] g Graph to release the memory from
+ */
+void release_unused_tensors(Graph &g);
 /** Calls accessor of a given tensor
  *
  * @param[in] tensor The tensor of which the accessor should be called
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index eb12fe4cca..d6fc8f0fcc 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -83,6 +83,14 @@ class CLDepthwiseConvolutionLayer : public IFunction
 public:
     /** Default constructor */
     CLDepthwiseConvolutionLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayer(const CLDepthwiseConvolutionLayer &) = delete;
+    /** Default move constructor */
+    CLDepthwiseConvolutionLayer(CLDepthwiseConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthwiseConvolutionLayer &operator=(const CLDepthwiseConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    CLDepthwiseConvolutionLayer &operator=(CLDepthwiseConvolutionLayer &&) = default;
     /** Initialize the function's source, destination, weights and convolution information.
      *
      * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
@@ -109,7 +117,9 @@ private:
     CLTensor _weights_reshaped;
     CLTensor _v2mm_output;
     CLTensor _output_reshaped;
+    bool _is_first_run;
     bool _is_quantized;
+    const ICLTensor *_original_weights;
 };
 }
 #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 1e9ee492ad..ad821318e9 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,6 +76,14 @@ class CLFullyConnectedLayer : public IFunction
 public:
     /** Constructor */
     CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFullyConnectedLayer(const CLFullyConnectedLayer &) = delete;
+    /** Default move constructor */
+    CLFullyConnectedLayer(CLFullyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFullyConnectedLayer &operator=(const CLFullyConnectedLayer &) = delete;
+    /** Default move assignment operator */
+    CLFullyConnectedLayer &operator=(CLFullyConnectedLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. Data type supported: QS8/QASYMM8/QS16/F16/F32.
@@ -121,6 +129,7 @@ private:
     bool _is_fc_after_conv;
     bool _accumulate_biases;
     bool _is_quantized;
+    const ICLTensor *_original_weights;
 };
 }
 #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index dc1211a51a..91d35ca0b2 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -103,6 +103,14 @@ public:
      * @param[in] memory_manager (Optional) Memory manager.
      */
     CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMConvolutionLayer(const CLGEMMConvolutionLayer &) = delete;
+    /** Default move constructor */
+    CLGEMMConvolutionLayer(CLGEMMConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMConvolutionLayer &operator=(const CLGEMMConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    CLGEMMConvolutionLayer &operator=(CLGEMMConvolutionLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -172,6 +180,8 @@ private:
     CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
     CLCol2ImKernel _col2im_kernel;
 
+    const ICLTensor *_original_weights;
+
     CLTensor _im2col_output;
     CLTensor _weights_reshaped;
     CLTensor _gemm_output;
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index e5aafd8547..b7b2587454 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -53,6 +53,14 @@ class CLLocallyConnectedLayer : public IFunction
 public:
     /** Default constructor */
     CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedLayer(const CLLocallyConnectedLayer &) = delete;
+    /** Default move constructor */
+    CLLocallyConnectedLayer(CLLocallyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedLayer &operator=(const CLLocallyConnectedLayer &) = delete;
+    /** Default move assignment operator */
+    CLLocallyConnectedLayer &operator=(CLLocallyConnectedLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -93,6 +101,7 @@ private:
     CLTensor _weights_reshaped;
     CLTensor _gemm_output;
     bool _is_first_run;
+    const ICLTensor *_original_weights;
 };
 }
 #endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index f5805f5ad1..fe65ac1a43 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -97,6 +97,14 @@ class NEDepthwiseConvolutionLayer : public IFunction
 public:
     /** Default constructor */
     NEDepthwiseConvolutionLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayer(const NEDepthwiseConvolutionLayer &) = delete;
+    /** Default move constructor */
+    NEDepthwiseConvolutionLayer(NEDepthwiseConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayer &operator=(const NEDepthwiseConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionLayer &operator=(NEDepthwiseConvolutionLayer &&) = default;
     /** Initialize the function's source, destination, weights and convolution information.
      *
      * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
@@ -123,7 +131,9 @@ private:
     Tensor _weights_reshaped;
     Tensor _v2mm_output;
     Tensor _output_reshaped;
+    bool _is_first_run;
     bool _is_quantized;
+    const ITensor *_original_weights;
 };
 }
 #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ */
\ No newline at end of file
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 9bc8d21fc4..071eecc3f7 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -94,6 +94,14 @@ class NEFullyConnectedLayer : public IFunction
 public:
     /** Constructor */
     NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
+    /** Default move constructor */
+    NEFullyConnectedLayer(NEFullyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFullyConnectedLayer &operator=(const NEFullyConnectedLayer &) = delete;
+    /** Default move assignment operator */
+    NEFullyConnectedLayer &operator=(NEFullyConnectedLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. Data type supported: QS8/QS16/F32.
@@ -134,6 +142,7 @@ private:
     bool _is_batched_fc_layer;
     bool _linearize_input;
     bool _accumulate_biases;
+    const ITensor *_original_weights;
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 43e9304414..e733fec4b6 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -101,7 +101,14 @@ class NEGEMMConvolutionLayer : public IFunction
 public:
     /** Constructor */
     NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
-
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
+    /** Default move constructor */
+    NEGEMMConvolutionLayer(NEGEMMConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMConvolutionLayer &operator=(const NEGEMMConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    NEGEMMConvolutionLayer &operator=(NEGEMMConvolutionLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -165,6 +172,8 @@ private:
     NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
     NECol2ImKernel _output_col2im_kernel;
 
+    const ITensor *_original_weights;
+
     Tensor _input_im2col_reshaped;
     Tensor _input_interleaved_reshaped;
     Tensor _weights_reshaped;
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index e07f09c588..18cd27414e 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -53,6 +53,14 @@ class NELocallyConnectedLayer : public IFunction
 public:
     /** Default constructor */
     NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedLayer(const NELocallyConnectedLayer &) = delete;
+    /** Default move constructor */
+    NELocallyConnectedLayer(NELocallyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedLayer &operator=(const NELocallyConnectedLayer &) = delete;
+    /** Default move assignment operator */
+    NELocallyConnectedLayer &operator=(NELocallyConnectedLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -93,6 +101,7 @@ private:
     Tensor _weights_reshaped;
     Tensor _gemm_output;
     bool _is_first_run;
+    const ITensor *_original_weights;
 };
 }
 #endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__ */
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index b65c4f4c97..eb5f072d82 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -151,3 +151,13 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
         }
     }
 }
+
+bool ITensor::is_used() const
+{
+    return _is_used;
+}
+
+void ITensor::mark_as_unused() const
+{
+    _is_used = false;
+}
diff --git a/src/graph2/GraphManager.cpp b/src/graph2/GraphManager.cpp
index e708dc6a61..a51ba61104 100644
--- a/src/graph2/GraphManager.cpp
+++ b/src/graph2/GraphManager.cpp
@@ -82,6 +82,12 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
 
     // Finalize Graph context
     ctx.finalize();
+
+    // Perform a first run
+    execute_graph(graph);
+
+    // Release all unused const nodes
+    detail::release_unused_tensors(graph);
 }
 
 void GraphManager::execute_graph(Graph &graph)
diff --git a/src/graph2/backends/CL/CLSubTensorHandle.cpp b/src/graph2/backends/CL/CLSubTensorHandle.cpp
index a001d57832..65a1ba4d5f 100644
--- a/src/graph2/backends/CL/CLSubTensorHandle.cpp
+++ b/src/graph2/backends/CL/CLSubTensorHandle.cpp
@@ -64,6 +64,11 @@ void CLSubTensorHandle::unmap()
     _sub_tensor.unmap();
 }
 
+void CLSubTensorHandle::release_if_unused()
+{
+    // noop
+}
+
 bool CLSubTensorHandle::is_subtensor() const
 {
     return true;
diff --git a/src/graph2/backends/CL/CLTensorHandle.cpp b/src/graph2/backends/CL/CLTensorHandle.cpp
index f515e0bac3..89678fb280 100644
--- a/src/graph2/backends/CL/CLTensorHandle.cpp
+++ b/src/graph2/backends/CL/CLTensorHandle.cpp
@@ -60,6 +60,15 @@ void CLTensorHandle::unmap()
     _tensor.unmap();
 }
 
+void CLTensorHandle::release_if_unused()
+{
+    // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
+    if(!_tensor.is_used())
+    {
+        _tensor.allocator()->free();
+    }
+}
+
 bool CLTensorHandle::is_subtensor() const
 {
     return false;
diff --git a/src/graph2/backends/GLES/GCTensorHandle.cpp b/src/graph2/backends/GLES/GCTensorHandle.cpp
index aa9ac8c2e9..2165cd2de6 100644
--- a/src/graph2/backends/GLES/GCTensorHandle.cpp
+++ b/src/graph2/backends/GLES/GCTensorHandle.cpp
@@ -60,6 +60,15 @@ void GCTensorHandle::unmap()
     _tensor.unmap();
 }
 
+void GCTensorHandle::release_if_unused()
+{
+    // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
+    if(!_tensor.is_used())
+    {
+        _tensor.allocator()->free();
+    }
+}
+
 bool GCTensorHandle::is_subtensor() const
 {
     return false;
diff --git a/src/graph2/backends/NEON/NESubTensorHandle.cpp b/src/graph2/backends/NEON/NESubTensorHandle.cpp
index 491cf8259c..1cd15be29c 100644
--- a/src/graph2/backends/NEON/NESubTensorHandle.cpp
+++ b/src/graph2/backends/NEON/NESubTensorHandle.cpp
@@ -61,6 +61,11 @@ void NESubTensorHandle::unmap()
     // noop
 }
 
+void NESubTensorHandle::release_if_unused()
+{
+    // noop
+}
+
 bool NESubTensorHandle::is_subtensor() const
 {
     return true;
diff --git a/src/graph2/backends/NEON/NETensorHandle.cpp b/src/graph2/backends/NEON/NETensorHandle.cpp
index a4af8aaf9b..0b901c3497 100644
--- a/src/graph2/backends/NEON/NETensorHandle.cpp
+++ b/src/graph2/backends/NEON/NETensorHandle.cpp
@@ -59,6 +59,15 @@ void NETensorHandle::unmap()
 {
 }
 
+void NETensorHandle::release_if_unused()
+{
+    // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
+    if(!_tensor.is_used())
+    {
+        _tensor.allocator()->free();
+    }
+}
+
 bool NETensorHandle::is_subtensor() const
 {
     return false;
diff --git a/src/graph2/detail/ExecutionHelpers.cpp b/src/graph2/detail/ExecutionHelpers.cpp
index ae214ad5a6..3688d0b0dc 100644
--- a/src/graph2/detail/ExecutionHelpers.cpp
+++ b/src/graph2/detail/ExecutionHelpers.cpp
@@ -135,6 +135,17 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx)
     return workload;
 }
 
+void release_unused_tensors(Graph &g)
+{
+    for(auto &tensor : g.tensors())
+    {
+        if(tensor != nullptr && tensor->handle() != nullptr)
+        {
+            tensor->handle()->release_if_unused();
+        }
+    }
+}
+
 void call_tensor_accessor(Tensor *tensor)
 {
     ARM_COMPUTE_ERROR_ON(!tensor);
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 112af60f35..8d7c92bdf1 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -64,7 +64,7 @@ void CLDepthwiseConvolutionLayer3x3::run()
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -78,7 +78,9 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     bool append_bias = (biases != nullptr) && !_is_quantized;
     const GPUTarget gpu_target = CLScheduler::get().target();
@@ -154,16 +156,23 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
 
 void CLDepthwiseConvolutionLayer::run()
 {
-    CLScheduler::get().enqueue(_im2col_kernel);
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
 
+    CLScheduler::get().enqueue(_im2col_kernel);
     CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
     CLScheduler::get().enqueue(_v2mm_kernel);
-
     CLScheduler::get().enqueue(_vector_to_tensor_kernel);
     if(_is_quantized)
     {
         CLScheduler::get().enqueue(_output_stage_kernel);
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 2b4670b98c..676706fb17 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -76,7 +76,7 @@ Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
 
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
-      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
+      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -152,6 +152,7 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     _is_fc_after_conv     = true;
     _accumulate_biases    = false;
     _is_quantized         = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _original_weights     = weights;
 
     // Configure gemmlowp output
     if(_is_quantized)
@@ -316,8 +317,13 @@ void CLFullyConnectedLayer::run()
     // Reshape of the weights (happens only once)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index e7ad62f5ff..f43e100565 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -90,8 +90,8 @@ void CLConvolutionLayerReshapeWeights::run()
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
-      _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _original_weights(nullptr),
+      _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
 {
 }
 
@@ -164,7 +164,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
                                                  weights_info,
                                                  dilation));
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const DataType dt = input->info()->data_type();
 
@@ -349,9 +351,13 @@ void CLGEMMConvolutionLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
-        _reshape_weights.run();
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
+        _reshape_weights.run();
         _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index a3eb5010bd..986fe00973 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
 
 CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+      _is_first_run(false), _original_weights(nullptr)
 {
 }
 
@@ -126,8 +126,9 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
+    bool _has_bias    = (biases != nullptr);
+    _original_weights = weights;
+    _is_first_run     = true;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -169,8 +170,13 @@ void CLLocallyConnectedLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf8805e..f28ed715f6 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -153,7 +153,7 @@ void NEDepthwiseConvolutionLayer3x3::run()
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -167,7 +167,9 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
 
     // Should bias be appended ?
     bool append_bias = (biases != nullptr) && !_is_quantized;
@@ -241,10 +243,21 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
 
 void NEDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
     NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
     if(_is_quantized)
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 26b7271710..b310ad35e3 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -132,7 +132,7 @@ void NEFullyConnectedLayerReshapeWeights::run()
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
-      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
+      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
 {
 }
 
@@ -163,6 +163,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
     const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
+    _original_weights     = weights;
     _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
@@ -324,8 +325,13 @@ void NEFullyConnectedLayer::run()
     // Reshape of the weights (happens only once)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index d9707d95e0..b2dd0227a5 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -217,7 +217,7 @@ Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInf
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
-      _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
+      _output_col2im_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
       _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
 {
 }
 
@@ -267,6 +267,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
 
     ARM_COMPUTE_ERROR_THROW_ON(status);
 
+    _original_weights = weights;
     const unsigned int fixed_point_position = input->info()->fixed_point_position();
     const ITensor     *biases_to_use        = (_append_bias) ? biases : nullptr;
 
@@ -549,8 +550,13 @@ void NEGEMMConvolutionLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 973559441f..913acf86a2 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
 
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+      _is_first_run(false), _original_weights(nullptr)
 {
 }
 
@@ -126,8 +126,9 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
+    bool _has_bias    = (biases != nullptr);
+    _is_first_run     = true;
+    _original_weights = weights;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -169,8 +170,13 @@ void NELocallyConnectedLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
        _is_first_run = false;
         NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
-- 
cgit v1.2.1
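
Illustration (not part of the patch): the sketch below mirrors the release mechanism this
commit introduces -- a mutable "used" flag on the tensor (ITensor::is_used() /
mark_as_unused()) that the tensor handle checks in release_if_unused() after the graph's
first run has consumed the original trainable parameters. All types here are minimal,
hypothetical stand-ins rather than Compute Library classes, so the intended call sequence
can be compiled and tested in isolation.

    #include <cassert>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Stand-in for arm_compute::ITensor: the flag is mutable so consumers that only
    // hold a `const MockTensor *` (as the layer functions do) can still clear it.
    class MockTensor
    {
    public:
        bool is_used() const         { return _is_used; }
        void mark_as_unused() const  { _is_used = false; }
        void allocate(std::size_t n) { _buffer.assign(n, 0); }
        void free()                  { _buffer.clear(); _buffer.shrink_to_fit(); }
        std::size_t size() const     { return _buffer.size(); }

    private:
        mutable bool      _is_used{ true };
        std::vector<char> _buffer{};
    };

    // Stand-in for a graph2 tensor handle: frees the backing memory only once the
    // tensor has been flagged unused (a sub-tensor handle would no-op instead).
    class MockTensorHandle
    {
    public:
        explicit MockTensorHandle(std::size_t n) { _tensor.allocate(n); }
        MockTensor &tensor() { return _tensor; }
        void release_if_unused()
        {
            if(!_tensor.is_used())
            {
                _tensor.free();
            }
        }

    private:
        MockTensor _tensor;
    };

    int main()
    {
        MockTensorHandle weights(1024); // trainable parameters, e.g. convolution weights

        // First run: a layer consumes the original weights once (weights reshaping)
        // and then flags them as no longer needed.
        assert(weights.tensor().is_used());
        weights.tensor().mark_as_unused();

        // Equivalent of GraphManager::finalize_graph(): sweep every tensor handle.
        weights.release_if_unused();
        std::cout << "backing buffer after release: " << weights.tensor().size() << " bytes\n"; // prints 0
        return 0;
    }

Note the ordering the patch relies on: the graph is executed once inside finalize_graph()
so that each function's first run reshapes and caches its weights before the sweep; only
then can release_unused_tensors() safely free the originals.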