From d7316eb877cc4ff8573219374335e917b19a0203 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 16 Jun 2021 11:14:41 +0100 Subject: Port NEGEMMConv2d to memory injecting interface Resolves: COMPMID-4506, COMPMID-4570 Change-Id: I6d37a06da141f1fcfcaa8525322a319cb0234791 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5824 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/runtime/NEON/functions/NEGEMM.h | 15 +- arm_compute/runtime/NEON/functions/NEGEMMConv2d.h | 5 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 63 +--- .../NEON/functions/NEWinogradConvolutionLayer.h | 15 +- src/core/helpers/MemoryHelpers.h | 2 +- src/runtime/CL/functions/CLGEMM.cpp | 14 +- src/runtime/NEON/functions/NEGEMM.cpp | 53 +++- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 74 ++++- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 350 ++++++++++++--------- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 5 - src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp | 77 +++-- src/runtime/cpu/operators/CpuGemmDirectConv2d.h | 40 ++- .../operators/internal/CpuGemmAssemblyDispatch.cpp | 265 +++++----------- .../operators/internal/CpuGemmAssemblyDispatch.h | 20 +- tests/validation/NEON/ConvolutionLayer.cpp | 95 ++++++ 15 files changed, 560 insertions(+), 533 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 6fa30bd545..6c5be0eb5e 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -32,6 +32,7 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/helpers/MemoryHelpers.h" #include @@ -105,14 +106,7 @@ public: void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM. * - * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32 - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. - * @param[out] output Output tensor info. 
Data type supported: same as @p a - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of matrix C - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run + * Similar to @ref NEGEMM::configure() * * @return a status */ @@ -146,7 +140,10 @@ private: bool _reshape_b_only_on_first_run; bool _is_prepared; - ITensorPack _asm_glue_tensors{}; + ITensorPack _asm_glue_run_pack; + ITensorPack _asm_glue_prep_pack; + WorkspaceData _asm_glue_workspace; + experimental::MemoryRequirements _aux_mem_req; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMM_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index f39ce4dfa3..53ceb6d978 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -29,15 +29,12 @@ #include "arm_compute/runtime/IMemoryManager.h" #include + namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} /** Basic function to compute the convolution layer. This function calls the following kernels/functions: * diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index dc9783f9eb..ff888760e1 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -24,32 +24,15 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H -#include "NEActivationLayer.h" -#include "arm_compute/core/ITensorPack.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" #include namespace arm_compute { class ITensor; -class NEConvertQuantizedSignednessKernel; -class NEGEMMInterleave4x4Kernel; -class NEGEMMLowpMatrixMultiplyKernel; -class NEGEMMLowpOffsetContributionKernel; -class NEGEMMLowpOffsetContributionOutputStageKernel; -class NEGEMMLowpMatrixAReductionKernel; -class NEGEMMLowpMatrixBReductionKernel; -class NEGEMMTranspose1xWKernel; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} - /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available: * * -# @ref NEGEMMInterleave4x4Kernel @@ -119,14 +102,7 @@ public: void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[in] output Output tensor info. 
Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should be executed only for the first run + * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure() * * @return a status */ @@ -137,41 +113,8 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr _asm_glue; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - std::unique_ptr _mtx_a_reduction_kernel; - std::unique_ptr _mtx_b_reduction_kernel; - std::unique_ptr _offset_contribution_kernel; - std::unique_ptr _offset_contribution_output_stage_kernel; - NEActivationLayer _activation_func; - std::unique_ptr _convert_to_signed_asymm; - std::unique_ptr _convert_from_signed_asymm; - - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - Tensor _signed_a; - Tensor _signed_output; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; - bool _run_activation; - bool _flip_signedness; - - ITensorPack _asm_glue_tensors{}; + struct Impl; + std::unique_ptr _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index f9ebf608cb..b02c4ed5b7 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -96,20 +96,9 @@ public: void run() override; void prepare() override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer + /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. 
Default is false + * Similar to @ref NEWinogradConvolutionLayer::configure() * * @return a status */ diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index e751e6025d..619a4ec122 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -70,7 +70,7 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement auto aux_tensor = workspace_memory.back().second.get(); ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor); - aux_tensor->allocator()->init(aux_info); + aux_tensor->allocator()->init(aux_info, req.alignment); if(req.lifetime == experimental::MemoryLifetime::Temporary) { diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 35126ec0d7..14b0633e09 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -41,14 +41,10 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *a{ nullptr }; const ICLTensor *b{ nullptr }; - const ICLTensor *c{ nullptr }; - ICLTensor *dst{ nullptr }; std::unique_ptr op{ nullptr }; MemoryGroup memory_group{}; IWeightsManager *weights_manager{ nullptr }; - CLTensor weights_transformed{}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; @@ -74,10 +70,7 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - _impl->a = a; _impl->b = b; - _impl->c = c; - _impl->dst = output; _impl->op = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); @@ -87,12 +80,12 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor // Manage/allocate auxilairy tensors if(_impl->is_prepared) { - _impl->run_pack.add_const_tensor(ACL_SRC_0, _impl->a); - _impl->run_pack.add_tensor(ACL_DST, _impl->dst); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, _impl->a }, { ACL_SRC_2, _impl->c }, { ACL_DST, _impl->dst } }; + _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } }; _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); @@ -110,7 +103,6 @@ void CLGEMM::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->a, _impl->b, _impl->dst); _impl->op->run(_impl->run_pack); } diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 7318c3e492..9b14052c75 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -38,10 +38,12 @@ #include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include +using namespace arm_compute::experimental; using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -61,9 +63,31 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } // namespace NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique()), _ma_kernel(), - _alpha_scale_func(nullptr), _add_bias(), 
_activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), - _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false) + : _memory_group(memory_manager), + _weights_manager(weights_manager), + _interleave_kernel(), + _transpose_kernel(), + _mm_kernel(), + _asm_glue(std::make_unique()), + _ma_kernel(), + _alpha_scale_func(nullptr), + _add_bias(), + _activation_func(), + _tmp_a(), + _tmp_b(), + _tmp_d(), + _original_b(nullptr), + _run_vector_matrix_multiplication(false), + _run_alpha_scale(false), + _run_addition(false), + _run_bias_addition(false), + _run_activation(false), + _reshape_b_only_on_first_run(false), + _is_prepared(false), + _asm_glue_run_pack(), + _asm_glue_prep_pack(), + _asm_glue_workspace(), + _aux_mem_req() { } @@ -94,13 +118,16 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); - _asm_glue_tensors = + _aux_mem_req = _asm_glue->workspace(); + _asm_glue_run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c_to_use }, { ACL_DST, d }, }; + _asm_glue_prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c_to_use } }; + _asm_glue_workspace = manage_workspace(_aux_mem_req, _memory_group, _asm_glue_run_pack, _asm_glue_prep_pack); // Scale product by alpha if(_run_alpha_scale) @@ -323,7 +350,7 @@ void NEGEMM::run() if(_asm_glue->is_configured()) { - _asm_glue->run(_asm_glue_tensors); + _asm_glue->run(_asm_glue_run_pack); if(_run_alpha_scale) { _alpha_scale_func.run(); @@ -372,16 +399,20 @@ void NEGEMM::prepare() const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); if(_asm_glue->is_configured()) { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _asm_glue->prepare(_asm_glue_prep_pack); - _asm_glue->prepare(_asm_glue_tensors); - if(!original_b_managed_by_weights_manager) + auto has_reshape = std::find_if(_aux_mem_req.begin(), + _aux_mem_req.end(), + [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if(has_reshape != std::end(_aux_mem_req)) { _original_b->mark_as_unused(); } + else + { + _asm_glue_run_pack.add_const_tensor(ACL_SRC_1, _original_b); + } } else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) { diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 564ce2f514..3ca5239ae3 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -24,50 +24,98 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" -#include - namespace arm_compute { using OperatorType = cpu::CpuGemmDirectConv2d; +using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - ITensorPack tensors{}; - std::unique_ptr op{ nullptr }; + const ITensor *weights{ nullptr }; + std::unique_ptr op{ nullptr }; + ITensorPack run_pack{}; + ITensorPack 
prep_pack{}; + WorkspaceData workspace{}; + MemoryGroup memory_group{}; + bool is_prepared{ false }; + experimental::MemoryRequirements aux_mem_req{}; }; NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { - _impl->op = std::make_unique(memory_manager); + _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input); - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights); - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases); - _impl->tensors.add_tensor(TensorType::ACL_DST, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + _impl->weights = weights; + _impl->is_prepared = false; + _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; + _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } }; + _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } + void NEGEMMConv2d::run() { - _impl->op->run(_impl->tensors); + prepare(); + + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } + void NEGEMMConv2d::prepare() { - _impl->op->prepare(_impl->tensors); + if(!_impl->is_prepared) + { + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), + _impl->aux_mem_req.end(), + [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if(has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->weights->mark_as_unused(); + } + else + { + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); + } + + // Release temporary tensors that are only used in prepare stage + for(auto &ws : _impl->workspace) + { + const int slot = ws.first; + for(auto &m : _impl->aux_mem_req) + { + if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) + { + auto tensor = ws.second.get(); + tensor->allocator()->free(); + break; + } + } + } + _impl->is_prepared = true; + } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index cc0f20e695..224fb1eb56 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -32,9 +32,14 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" 
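The NEGEMM and NEGEMMConv2d changes above all follow the same memory-injection flow: the stateless operator only reports its auxiliary memory via workspace(), while the function-level wrapper allocates those buffers, injects them into a run pack and a prepare pack, and releases prepare-only buffers once prepare() has run. Below is a minimal, self-contained sketch of that flow; every type in it (StatelessOp, FunctionWrapper, TensorPack, MemoryInfo, ...) is a simplified stand-in, not the real Compute Library class.

    // Sketch only: simplified stand-in types, not the Compute Library API.
    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <memory>
    #include <vector>

    enum class Lifetime { Temporary, Prepare, Persistent };

    struct MemoryInfo { int slot; Lifetime lifetime; std::size_t size; };
    struct Tensor     { std::vector<std::uint8_t> buffer; void free() { buffer.clear(); } };
    using  TensorPack = std::map<int, Tensor *>;          // slot -> tensor
    using  MemoryReqs = std::vector<MemoryInfo>;

    // Stateless operator: it only *describes* the auxiliary memory it needs.
    struct StatelessOp
    {
        MemoryReqs workspace() const { return { { 100, Lifetime::Prepare, 256 } }; }
        void prepare(TensorPack &) {}
        void run(TensorPack &) {}
    };

    // Function-level wrapper: owns the buffers and injects them into the packs
    // (roughly the role manage_workspace() plays for the functions in this patch).
    struct FunctionWrapper
    {
        StatelessOp op{};
        MemoryReqs  aux_mem{};
        TensorPack  run_pack{}, prep_pack{};
        std::vector<std::unique_ptr<Tensor>> workspace{};
        bool        prepared{ false };

        void configure()
        {
            aux_mem = op.workspace();
            for(const auto &req : aux_mem)
            {
                workspace.emplace_back(std::make_unique<Tensor>());
                workspace.back()->buffer.resize(req.size);
                run_pack[req.slot]  = workspace.back().get();
                prep_pack[req.slot] = workspace.back().get();
            }
        }

        void prepare()
        {
            if(prepared) { return; }
            op.prepare(prep_pack);
            // Buffers needed only while preparing are released straight away.
            for(const auto &req : aux_mem)
            {
                if(req.lifetime == Lifetime::Prepare) { run_pack[req.slot]->free(); }
            }
            prepared = true;
        }

        void run()
        {
            prepare();
            op.run(run_pack);
        }
    };

The real functions additionally scan the requirements for a Persistent entry (the has_reshape lookup above) to decide whether the original weights tensor can be marked as unused or must be kept in the run pack.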
#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" #include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" @@ -61,17 +66,58 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } } // namespace +struct NEGEMMLowpMatrixMultiplyCore::Impl +{ + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{ nullptr }; + std::unique_ptr asm_glue{ nullptr }; + std::unique_ptr mm_kernel{ nullptr }; + std::unique_ptr mtx_a_reshape_kernel{ nullptr }; + std::unique_ptr mtx_b_reshape_kernel{ nullptr }; + std::unique_ptr mtx_a_reduction_kernel{ nullptr }; + std::unique_ptr mtx_b_reduction_kernel{ nullptr }; + std::unique_ptr offset_contribution_kernel{ nullptr }; + std::unique_ptr offset_contribution_output_stage_kernel{ nullptr }; + std::unique_ptr activation_func{ nullptr }; + std::unique_ptr convert_to_signed_asymm{ nullptr }; + std::unique_ptr convert_from_signed_asymm{ nullptr }; + + Tensor vector_sum_col{}; + Tensor vector_sum_row{}; + Tensor tmp_a{}; + Tensor tmp_b{}; + Tensor mm_result_s32{}; + Tensor signed_a{}; + Tensor signed_output{}; + const ITensor *original_b{ nullptr }; + int32_t a_offset{ 0 }; + int32_t b_offset{ 0 }; + + bool run_vector_matrix_multiplication{ false }; + bool assembly_path{ false }; + bool fused_assembly_path{ false }; + bool reshape_b_only_on_first_run{ false }; + bool is_prepared{ false }; + bool fuse_output_stage{ false }; + bool run_activation{ false }; + bool flip_signedness{ false }; + + experimental::MemoryRequirements aux_mem_req{}; + ITensorPack asm_glue_run_pack{}; + ITensorPack asm_glue_prep_pack{}; + WorkspaceData asm_glue_workspace{}; +}; + +using namespace arm_compute::experimental; using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), - _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), - _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), - _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), - _run_activation(false), _flip_signedness(false) + : _impl(std::make_unique()) { + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) @@ -85,53 +131,55 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, GEMMInfo info = gemm_info; // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - 
_flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _original_b = b; + _impl->a_offset = a->info()->quantization_info().uniform().offset; + _impl->b_offset = b->info()->quantization_info().uniform().offset; + _impl->run_vector_matrix_multiplication = a->info()->dimension(1) < 2; + _impl->reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); + _impl->is_prepared = false; + _impl->fused_assembly_path = false; + _impl->flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _impl->reshape_b_only_on_first_run; + _impl->original_b = b; + + _impl->asm_glue = std::make_unique(); const ITensor *a_to_use = a; // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) + if(_impl->flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform(); - _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); - _memory_group.manage(&_signed_a); - _convert_to_signed_asymm = std::make_unique(); - _convert_to_signed_asymm->configure(a_to_use, &_signed_a); - a_to_use = &_signed_a; - _a_offset = _signed_a.info()->quantization_info().uniform().offset; + _impl->signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); + _impl->memory_group.manage(&_impl->signed_a); + _impl->convert_to_signed_asymm = std::make_unique(); + _impl->convert_to_signed_asymm->configure(a_to_use, &_impl->signed_a); + a_to_use = &_impl->signed_a; + _impl->a_offset = _impl->signed_a.info()->quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); - _memory_group.manage(&_signed_output); - _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); + _impl->memory_group.manage(&_impl->signed_output); + _impl->signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset; + output_stage_corr.gemmlowp_offset = _impl->signed_output.info()->quantization_info().uniform().offset; output_stage_corr.gemmlowp_min_bound -= offset_correction; output_stage_corr.gemmlowp_max_bound -= offset_correction; info.set_gemmlowp_output_stage(output_stage_corr); // Update matrix a - matrix_a = &_signed_a; + matrix_a = &_impl->signed_a; } // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); + _impl->fuse_output_stage = true; + _impl->memory_group.manage(&_impl->mm_result_s32); TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); + _impl->mm_result_s32.allocator()->init(info_mm_result_s32); } // Initialize 
assembly kernel meta-data @@ -147,20 +195,28 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { auto c_info_to_use = c == nullptr ? nullptr : c->info(); - _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info); - _fused_assembly_path = _asm_glue->is_configured(); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); - _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output); + _impl->asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info); + _impl->fused_assembly_path = _impl->asm_glue->is_configured(); + _impl->asm_glue_run_pack.add_const_tensor(TensorType::ACL_SRC_2, c); + _impl->asm_glue_run_pack.add_tensor(TensorType::ACL_DST, output); } else { - auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output); - _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info); - _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); + auto output_to_use = (_impl->fuse_output_stage ? &_impl->mm_result_s32 : output); + _impl->asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info); + _impl->asm_glue_run_pack.add_tensor(TensorType::ACL_DST, output_to_use); + } + _impl->assembly_path = _impl->asm_glue->is_configured(); + + if(_impl->assembly_path) + { + _impl->asm_glue_run_pack.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + + _impl->aux_mem_req = _impl->asm_glue->workspace(); + _impl->asm_glue_prep_pack = { { TensorType::ACL_SRC_1, b }, { TensorType::ACL_SRC_2, c } }; + + _impl->asm_glue_workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->asm_glue_run_pack, _impl->asm_glue_prep_pack); } - _assembly_path = _asm_glue->is_configured(); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); break; } default: @@ -170,142 +226,144 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, } } #endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) + if(!(_impl->assembly_path || _impl->run_vector_matrix_multiplication)) { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; + matrix_a = &_impl->tmp_a; + matrix_b = &_impl->tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) + _impl->tmp_a.allocator()->init(a_info); + _impl->tmp_b.allocator()->init(b_info); + _impl->memory_group.manage(&_impl->tmp_a); + if(!_impl->reshape_b_only_on_first_run) { - _memory_group.manage(&_tmp_b); + _impl->memory_group.manage(&_impl->tmp_b); } // Configure interleave kernel - _mtx_a_reshape_kernel = std::make_unique(); - _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a); + _impl->mtx_a_reshape_kernel = std::make_unique(); + 
_impl->mtx_a_reshape_kernel->configure(a_to_use, &_impl->tmp_a); // Configure transpose kernel - _mtx_b_reshape_kernel = std::make_unique(); - _mtx_b_reshape_kernel->configure(b, &_tmp_b); + _impl->mtx_b_reshape_kernel = std::make_unique(); + _impl->mtx_b_reshape_kernel->configure(b, &_impl->tmp_b); } - if(!_fused_assembly_path) + if(!_impl->fused_assembly_path) { // Build reduction info const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false); - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + // Initialize matrix B reduction kernel only if _impl->a_offset is not equal to 0 + if(_impl->a_offset != 0) { TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) + _impl->vector_sum_col.allocator()->init(info_vector_sum_col); + if(!_impl->reshape_b_only_on_first_run) { - _memory_group.manage(&_vector_sum_col); + _impl->memory_group.manage(&_impl->vector_sum_col); } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel = std::make_unique(); - _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info); + _impl->mtx_b_reduction_kernel = std::make_unique(); + _impl->mtx_b_reduction_kernel->configure(b, &_impl->vector_sum_col, reduction_info); } - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + // Initialize Matrix A reduction kernel only if _impl->b_offset is not equal to 0 + if(_impl->b_offset != 0) { TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); + _impl->vector_sum_row.allocator()->init(info_vector_sum_row); + _impl->memory_group.manage(&_impl->vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel = std::make_unique(); - _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); + _impl->mtx_a_reduction_kernel = std::make_unique(); + _impl->mtx_a_reduction_kernel->configure(a_to_use, &_impl->vector_sum_row, reduction_info); } - if(_fuse_output_stage) + if(_impl->fuse_output_stage) { // Configure matrix multiply kernel - if(!_assembly_path) + if(!_impl->assembly_path) { - _mm_kernel = std::make_unique(); - _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); + _impl->mm_kernel = std::make_unique(); + _impl->mm_kernel->configure(matrix_a, matrix_b, &_impl->mm_result_s32); } - _offset_contribution_output_stage_kernel = std::make_unique(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, - a->info()->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _impl->offset_contribution_output_stage_kernel = std::make_unique(); + _impl->offset_contribution_output_stage_kernel->configure(&_impl->mm_result_s32, + _impl->a_offset == 0 ? nullptr : &_impl->vector_sum_col, + _impl->b_offset == 0 ? nullptr : &_impl->vector_sum_row, c, + _impl->flip_signedness ? 
&_impl->signed_output : output, + a->info()->dimension(0), + _impl->a_offset, _impl->b_offset, info.gemmlowp_output_stage()); - if(_flip_signedness) + if(_impl->flip_signedness) { - _convert_from_signed_asymm = std::make_unique(); - _convert_from_signed_asymm->configure(&_signed_output, output); + _impl->convert_from_signed_asymm = std::make_unique(); + _impl->convert_from_signed_asymm->configure(&_impl->signed_output, output); } } else { // Configure matrix multiply kernel - if(!_assembly_path) + if(!_impl->assembly_path) { - _mm_kernel = std::make_unique(); - _mm_kernel->configure(matrix_a, matrix_b, output); + _impl->mm_kernel = std::make_unique(); + _impl->mm_kernel->configure(matrix_a, matrix_b, output); } // Configure offset contribution kernel - _offset_contribution_kernel = std::make_unique(); - _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); + _impl->offset_contribution_kernel = std::make_unique(); + _impl->offset_contribution_kernel->configure(output, _impl->a_offset == 0 ? nullptr : &_impl->vector_sum_col, _impl->b_offset == 0 ? nullptr : &_impl->vector_sum_row, a_to_use->info()->dimension(0), + _impl->a_offset, _impl->b_offset); } } // Configure activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) + _impl->run_activation = activation.enabled() && (!_impl->assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if(_impl->run_activation) { - _activation_func.configure(output, nullptr, activation); + _impl->activation_func = std::make_unique(); + _impl->activation_func->configure(output, nullptr, activation); } // Allocate tensors - if(!_assembly_path && !_run_vector_matrix_multiplication) + if(!_impl->assembly_path && !_impl->run_vector_matrix_multiplication) { - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) + _impl->tmp_a.allocator()->allocate(); + if(!_impl->reshape_b_only_on_first_run) { - _tmp_b.allocator()->allocate(); + _impl->tmp_b.allocator()->allocate(); } } - if(!_fused_assembly_path) + if(!_impl->fused_assembly_path) { - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if(_impl->a_offset != 0 && !_impl->reshape_b_only_on_first_run) { - _vector_sum_col.allocator()->allocate(); + _impl->vector_sum_col.allocator()->allocate(); } - if(_b_offset != 0) + if(_impl->b_offset != 0) { - _vector_sum_row.allocator()->allocate(); + _impl->vector_sum_row.allocator()->allocate(); } } - if(_fuse_output_stage) + if(_impl->fuse_output_stage) { - _mm_result_s32.allocator()->allocate(); + _impl->mm_result_s32.allocator()->allocate(); } - if(_flip_signedness) + if(_impl->flip_signedness) { - _signed_a.allocator()->allocate(); - _signed_output.allocator()->allocate(); + _impl->signed_a.allocator()->allocate(); + _impl->signed_output.allocator()->allocate(); } } @@ -509,118 +567,112 @@ void NEGEMMLowpMatrixMultiplyCore::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); + MemoryGroupResourceScope scope_mg(_impl->memory_group); // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) + if(_impl->flip_signedness) { - NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY); + NEScheduler::get().schedule(_impl->convert_to_signed_asymm.get(), Window::DimY); } // Run GEMM - 
if(_asm_glue->is_configured()) + if(_impl->asm_glue->is_configured()) { - _asm_glue->run(_asm_glue_tensors); + _impl->asm_glue->run(_impl->asm_glue_run_pack); } else { - if(!_run_vector_matrix_multiplication) + if(!_impl->run_vector_matrix_multiplication) { // Run interleave kernel - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->mtx_a_reshape_kernel.get(), Window::DimY); - if(!_reshape_b_only_on_first_run) + if(!_impl->reshape_b_only_on_first_run) { // Run transpose kernel - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->mtx_b_reshape_kernel.get(), Window::DimY); } } - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->mm_kernel.get(), Window::DimY); } - if(!_fused_assembly_path) + if(!_impl->fused_assembly_path) { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + // Run matrix A reduction kernel only if _impl->b_offset is not equal to 0 + if(_impl->b_offset != 0) { - NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX); + NEScheduler::get().schedule(_impl->mtx_a_reduction_kernel.get(), Window::DimX); } - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + // Run matrix B reduction kernel only if _impl->a_offset is not equal to 0 + if(_impl->a_offset != 0 && !_impl->reshape_b_only_on_first_run) { - NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); + NEScheduler::get().schedule(_impl->mtx_b_reduction_kernel.get(), Window::DimX); } - if(_fuse_output_stage) + if(_impl->fuse_output_stage) { // Run offset contribution kernel - NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->offset_contribution_output_stage_kernel.get(), Window::DimY); } else { // Run offset contribution kernel - NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->offset_contribution_kernel.get(), Window::DimY); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + if(!_impl->fused_assembly_path && _impl->fuse_output_stage && _impl->flip_signedness) { - NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY); + NEScheduler::get().schedule(_impl->convert_from_signed_asymm.get(), Window::DimY); } // Run fused activation unless already run in the fused assembly - if(_run_activation) + if(_impl->run_activation) { - _activation_func.run(); + _impl->activation_func->run(); } } void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if(!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); // Run assembly reshape - if(_asm_glue->is_configured()) + if(_impl->asm_glue->is_configured()) { - if(!original_b_managed_by_weights_manager) + _impl->asm_glue->prepare(_impl->asm_glue_prep_pack); + + auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), + _impl->aux_mem_req.end(), + [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if(has_reshape != std::end(_impl->aux_mem_req)) { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + _impl->original_b->mark_as_unused(); } - - _asm_glue->prepare(_asm_glue_tensors); - if(!original_b_managed_by_weights_manager) 
+ else { - _original_b->mark_as_unused(); + _impl->asm_glue_run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b); } } // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else if(_impl->reshape_b_only_on_first_run && !_impl->run_vector_matrix_multiplication && !_impl->asm_glue->is_configured()) { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + _impl->tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(_impl->mtx_b_reshape_kernel.get(), Window::DimY); } - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + // Run matrix B reduction kernel only if _impl->a_offset is not equal to 0 + if(!_impl->fused_assembly_path && _impl->a_offset != 0 && _impl->reshape_b_only_on_first_run) { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); + _impl->vector_sum_col.allocator()->allocate(); + NEScheduler::get().schedule(_impl->mtx_b_reduction_kernel.get(), Window::DimX); } - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 0bf1738bec..57950d5126 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -29,12 +29,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/core/NEON/kernels/convolution/winograd/winograd.hpp" diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp index e50099df1f..c2e9f24ff6 100644 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" + +#include "support/Cast.h" #include @@ -37,6 +37,9 @@ namespace arm_compute { namespace cpu { +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; + namespace { GEMMLowpOutputStageInfo calculate_output_stage_metadata(const 
ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) @@ -87,12 +90,14 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect } } // namespace -CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr &memory_manager) - : _gemm_asm_func(std::make_unique(memory_manager)), +CpuGemmDirectConv2d::CpuGemmDirectConv2d() + : _gemm_asm_func(std::make_unique()), _activation_func(std::make_unique()), _weights_permute_func(std::make_unique()), - _permuted_weights_info(), - _permuted_weights(std::make_unique()) + _aux_mem(AuxTensorIdx::Count), + _perm_weights(), + _run_activation(false), + _is_prepared(false) { } @@ -106,8 +111,10 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w biases != nullptr ? biases : nullptr, dst, info)); - _original_weights_info = weights; - _weights_permute_func->configure(weights, &_permuted_weights_info, PermutationVector{ 3, 0, 1, 2 }); + _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); + _is_prepared = false; + + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 }); // Configure assembly dispatch cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); @@ -115,13 +122,27 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w { asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); } - _gemm_asm_func->configure(src, &_permuted_weights_info, biases, dst, asm_info); + _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); // Configure activation - if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info)) + if(_run_activation) { _activation_func->configure(dst, nullptr, info.act_info); - _run_activation = true; + } + + // Add auxiliary memory requirements of the assembly dispatch + auto asm_mem_req = _gemm_asm_func->workspace(); + _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; + _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; + + if(_aux_mem[Pretranspose].size > 0) + { + // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch + _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + } + else + { + _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); } } Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) @@ -172,35 +193,29 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors) } } -void CpuGemmDirectConv2d::allocate_permuted_weights() -{ - // TODO: This function will be removed when memory injection is implemeted. 
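For context on the PermutedWeights lifetime chosen in CpuGemmDirectConv2d::configure() above: when the assembly dispatch reports a Pretranspose requirement, the permuted copy of the weights is only consumed during prepare(), so it can be declared with MemoryLifetime::Prepare; otherwise it has to outlive prepare(). A rough sketch of that decision follows, with simplified stand-in types rather than the real MemoryInfo / offset_int_vec helpers.

    // Sketch only: simplified stand-in types, not the Compute Library API.
    #include <cstddef>

    enum class Lifetime { Temporary, Prepare, Persistent };
    struct MemoryInfo { int slot; Lifetime lifetime; std::size_t size; };

    MemoryInfo permuted_weights_requirement(bool asm_path_pretransposes, int slot, std::size_t weights_size)
    {
        // Permuted weights are transposed again by the assembly path -> prepare-only;
        // otherwise they are the buffer actually used at run time -> persistent.
        const Lifetime lt = asm_path_pretransposes ? Lifetime::Prepare : Lifetime::Persistent;
        return MemoryInfo{ slot, lt, weights_size };
    }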
- ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr); - _permuted_weights->allocator()->free(); - _permuted_weights->allocator()->init(_permuted_weights_info); - _permuted_weights->allocator()->allocate(); -} - void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) { if(!_is_prepared) { - allocate_permuted_weights(); - ITensorPack permute_tensors - { - { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, - { TensorType::ACL_DST, _permuted_weights.get() }, - }; + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); + CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); + ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; _weights_permute_func->run(permute_tensors); - tensors.get_const_tensor(TensorType::ACL_SRC_1)->mark_as_unused(); + tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); + // Call prepare of assembly dispatch + _gemm_asm_func->prepare(tensors); - // switch the original tensor with permuted tensor - tensors.add_const_tensor(TensorType::ACL_SRC_1, _permuted_weights.get()); _is_prepared = true; } } +experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const +{ + return _aux_mem; +} } // namespace cpu } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h index 6aa17c2349..b572f36a3a 100644 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h @@ -24,14 +24,12 @@ #ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H #define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/core/TensorInfo.h" #include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" #include "src/runtime/cpu/ICpuOperator.h" - -#include +#include "src/runtime/cpu/operators/CpuActivation.h" +#include "src/runtime/cpu/operators/CpuPermute.h" +#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" namespace arm_compute { @@ -40,15 +38,11 @@ class ITensor; struct Conv2dInfo; namespace cpu { -class CpuGemmAssemblyDispatch; -class CpuActivation; -class CpuPermute; - class CpuGemmDirectConv2d : public ICpuOperator { public: /** Constructor */ - CpuGemmDirectConv2d(const std::shared_ptr &memory_manager = nullptr); + CpuGemmDirectConv2d(); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d); /** Destructor */ ~CpuGemmDirectConv2d(); @@ -89,22 +83,24 @@ public: // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; private: + enum AuxTensorIdx + { + AsmGemmWorkspace = 0, + Pretranspose, + PermutedWeights, + Count + }; + std::unique_ptr _gemm_asm_func; std::unique_ptr _activation_func; std::unique_ptr _weights_permute_func; - const ITensorInfo *_original_weights_info{}; - TensorInfo _permuted_weights_info; - std::unique_ptr _permuted_weights{ nullptr }; - bool _is_prepared{ false }; - bool _run_activation{ false }; - - /** Function to allocated a tensor for permuted weights - * - * @note This function will be removed when memory injection is properly implemented. 
- */ - void allocate_permuted_weights(); + experimental::MemoryRequirements _aux_mem; + TensorInfo _perm_weights; + bool _run_activation; + bool _is_prepared; }; } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 1101e05a0d..79ea1cb5a7 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -27,15 +27,18 @@ #include "src/core/CPP/Validate.h" #include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/core/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/AssemblyUtils.h" +#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" #include -#include namespace arm_compute { namespace cpu { +using namespace arm_compute::experimental; + namespace { struct free_delete @@ -113,103 +116,27 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp return scheduling_hint; } -template -class FallbackTransform : public ITransformWeights -{ -public: - FallbackTransform() noexcept {}; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform(const FallbackTransform &) = delete; - /** Default move constructor */ - FallbackTransform(FallbackTransform &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform &operator=(const FallbackTransform &) = delete; - /** Default move assignment operator */ - FallbackTransform &operator=(FallbackTransform &&) = default; - void run() override - { - _output.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - uint32_t id = (_B_pretranspose_size | 0x80000000); - return id; - } - - void configure(size_t B_pretranspose_size, unsigned int alignment) - { - _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment); - _B_pretranspose_size = B_pretranspose_size; - } - - void set_pretranspose(ITensor *tensor) - { - if(!_reshape_run) - { - _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer()); - } - } - - void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr> gemm_kernel_asm) - { - _ldb = ldb; - _in1_ptr = in1_ptr; - _multi_stride_b = multi_stride_b; - _gemm_kernel_asm = gemm_kernel_asm; - } - -private: - Tensor _output{}; - int _ldb{}; - const TypeInput *_in1_ptr{}; - int _multi_stride_b{}; - size_t _B_pretranspose_size{}; - std::shared_ptr> _gemm_kernel_asm{ nullptr }; -}; - /** Fallback in case ACL doesn't have a function */ template class Fallback : public CpuGemmAssemblyDispatch::IFallback { public: /** Destructor */ - ~Fallback() - { - if(_pretranspose && !(is_weight_managed())) - { - delete _pretranspose; - } - } + ~Fallback() = default; /** Initialise the functions's input and output. * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[in] c Input tensor containing the Matrix C. 
- * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] args Matrix multiplication information. - * @param[in] gemm_info GEMM meta-data - * @param[in] memory_group Memory group to be used by the function. - * @param[in] weights_manager Weights manager to be used by the function. - * @param[in] os Output stage meta-data. + * @param[in] a Input tensor containing the Matrix A. + * @param[in] b Input tensor containing the Matrix B. + * @param[in] c Input tensor containing the Matrix C. + * @param[out] d Output tensor to store the result of matrix multiplication. + * @param[in] args Matrix multiplication information. + * @param[in] gemm_info GEMM meta-data + * @param[in] os Output stage meta-data. */ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); + const OutputStage &os = {}); /** Set requantization shifts to be used * @@ -231,16 +158,17 @@ public: // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; - bool is_configured() const override; + bool is_configured() const override; + experimental::MemoryRequirements workspace() const override; private: - /** Allocate a workspace tensor. - * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. - */ - void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); + enum AuxTensorIdx + { + AsmGemmWorkspace = 0, + Pretranspose, + Count + }; + /** Configure the indirect buffer * * @param[in] a Input tensor containing the Matrix A. @@ -256,18 +184,14 @@ private: std::shared_ptr> _gemm_kernel_asm{ nullptr }; /** Optimised Arm® Neon™ kernel */ std::unique_ptr _optimised_kernel{ nullptr }; - /** GEMM workspace */ - Tensor _workspace{}; - /** Pre-transpose tensor */ - ITensor *_pretranspose{ nullptr }; + /** Assembly GEMM workspace tensor info */ + TensorInfo _workspace_info{}; + /** Pre-transpose tensor info */ + TensorInfo _pretranspose_info{}; /** Prepared flag */ bool _is_prepared{ false }; /** GEMM meta-data */ AsmGemmInfo _gemm_info{}; - /** Weights manager */ - IWeightsManager *_weights_manager{ nullptr }; - /** Weights transform object */ - FallbackTransform _weights_transform{}; /** GEMM kernel description */ arm_gemm::KernelDescription _kernel_info{}; /** Per channel quantization shifts */ @@ -279,27 +203,9 @@ private: /** Indirect buffer */ std::unique_ptr _indirect_arg{}; std::unique_ptr _indirect_buf{}; - std::vector _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - - bool is_weight_managed() - { - // TODO (COMPMID-4539): This function should do the following: - // _weights_manager && _weights_manager->are_weights_managed(_b) - // , where _b is the second Tensor that is used to be given to the configure(). - // Currently, however, weight manager is disabled to make this class stateless. - // This should be revisited in the future. - return false; - } - - void acquire_managed_weight() - { - // TODO (COMPMID-4539): This function should do the following: - // _pretranspose = _weights_manager->acquire(_b, &_weights_transform); - // , where _b is the second Tensor that is used to be given to the configure(). - // Currently, however, weight manager is disabled to make this class stateless. 
- _pretranspose = nullptr; - } + std::vector _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{ Count }; }; template @@ -439,12 +345,11 @@ void Fallback::configure_indirect(const ITen template void Fallback::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) + const OutputStage &os) { ARM_COMPUTE_UNUSED(c); arm_gemm::GemmConfig gemm_cfg; - _kernel_info = arm_gemm::get_gemm_method(args, os); - _weights_manager = weights_manager; + _kernel_info = arm_gemm::get_gemm_method(args, os); if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED) { gemm_cfg.filter = _kernel_info.name; @@ -461,13 +366,10 @@ void Fallback::configure(const ITensorInfo * auto acl_gemm_wrapper = std::make_unique>(); ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); - const size_t workspace_size = _gemm_kernel_asm->get_working_size(); - if(workspace_size > 0) - { - // Allocate workspace - const unsigned int alignment = 4096; - allocate_workspace(workspace_size, memory_group, alignment); - } + const size_t workspace_size = _gemm_kernel_asm->get_working_size(); + const unsigned int alignment = 4096; + _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); + _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 @@ -487,16 +389,8 @@ void Fallback::configure(const ITensorInfo * // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); - if(is_weight_managed()) - { - _weights_transform.configure(B_pretranspose_size, alignment); - acquire_managed_weight(); - } - else - { - _pretranspose = new Tensor(); - static_cast(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment); - } + _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); + _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); } // Handle indirect GEMM convolution @@ -509,10 +403,11 @@ void Fallback::configure(const ITensorInfo * template void Fallback::prepare(ITensorPack &tensors) { - auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); if(!_is_prepared) { + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
if(c && c->info()->data_type() == DataType::S32) { @@ -526,24 +421,9 @@ void Fallback::prepare(ITensorPack &tensors) const auto in1_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput); - if(is_weight_managed()) - { - _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm); - _weights_manager->run(b, &_weights_transform); - - // If we didn't run the reshape function, set the pretransposed buffer - if(!_weights_transform.is_reshape_run()) - { - _weights_transform.set_pretranspose(_pretranspose); - } - } - else - { - static_cast(_pretranspose)->allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b); - b->mark_as_unused(); - } + CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); + ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); + _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b); } if(_gemm_info.method == AsmConvMethod::Indirect) @@ -556,18 +436,15 @@ void Fallback::prepare(ITensorPack &tensors) } template -void Fallback::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment) +bool Fallback::is_configured() const { - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0"); - _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment); - memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); + return _optimised_kernel != nullptr; } template -bool Fallback::is_configured() const +experimental::MemoryRequirements Fallback::workspace() const { - return _optimised_kernel != nullptr; + return _aux_mem; } template @@ -609,9 +486,10 @@ void Fallback::run(ITensorPack &tensors) const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type()); // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads - if(_workspace.buffer() != nullptr) + CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); + if(workspace.get()->buffer() != nullptr) { - _gemm_kernel_asm->set_working_space(reinterpret_cast(_workspace.buffer())); + _gemm_kernel_asm->set_working_space(reinterpret_cast(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); @@ -656,9 +534,9 @@ void Fallback::run(ITensorPack &tensors) } template -void create_arm_gemm(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, - IWeightsManager *weights_manager) +void create_arm_gemm(std::unique_ptr &arm_gemm, + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, + arm_gemm::Activation activation, const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); const CPUInfo &ci = NEScheduler::get().cpu_info(); @@ -668,14 +546,14 @@ void create_arm_gemm(std::unique_ptr &arm_ge // Create arm_gemm fallback auto fallback = std::make_unique>(); - fallback->configure(a, b, c, d, args, info, memory_group, 
weights_manager); + fallback->configure(a, b, c, d, args, info); arm_gemm = std::move(fallback); } template -void create_arm_gemm_quant(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, - IWeightsManager *weights_manager) +void create_arm_gemm_quant(std::unique_ptr &arm_gemm, + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, + arm_gemm::Activation activation, const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); @@ -713,14 +591,13 @@ void create_arm_gemm_quant(std::unique_ptr & } // Configure fallback - fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info); + fallback->configure(a, b, c, d, args, info, gemm_requant_info); arm_gemm = std::move(fallback); } - } //namespace -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager) +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() + : _arm_gemm(nullptr) { } @@ -775,40 +652,40 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo switch(a->data_type()) { case DataType::F32: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: if(d->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, a, b, c, d, act, info); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: if(d->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, a, b, c, d, act, info); } break; #endif /* __aarch64__ */ #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) case DataType::BFLOAT16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); break; #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -829,10 +706,14 @@ bool CpuGemmAssemblyDispatch::is_configured() const void CpuGemmAssemblyDispatch::run(ITensorPack &tensors) { - MemoryGroupResourceScope scope_mg(_memory_group); - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); _arm_gemm->run(tensors); } + +experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + return _arm_gemm->workspace(); +} } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h 
b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h index ffc097c75c..355273adeb 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -24,10 +24,6 @@ #ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" #include "src/core/common/Macros.h" #include "src/runtime/cpu/ICpuOperator.h" @@ -62,7 +58,7 @@ class CpuGemmAssemblyDispatch : public ICpuOperator { public: /** Constructor */ - CpuGemmAssemblyDispatch(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + CpuGemmAssemblyDispatch(); /** Defautl destructor */ ~CpuGemmAssemblyDispatch() = default; @@ -71,10 +67,11 @@ public: class IFallback { public: - virtual void run(ITensorPack &tensors) = 0; - virtual void prepare(ITensorPack &tensors) = 0; - virtual bool is_configured() const = 0; - virtual ~IFallback() = default; + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual experimental::MemoryRequirements workspace() const = 0; + virtual bool is_configured() const = 0; + virtual ~IFallback() = default; }; public: @@ -115,11 +112,10 @@ public: // Inherited methods overridden: void prepare(ITensorPack &tensors) override; void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; private: - std::unique_ptr _arm_gemm; /**< Interface for the arm_gemm fallback */ - MemoryGroup _memory_group; /**< Function memory group */ - IWeightsManager *_weights_manager; /**< Pointer to the weights manager */ + std::unique_ptr _arm_gemm; /**< Interface for the arm_gemm fallback */ }; } // namespace cpu } // namespace arm_compute diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 9e00da16ae..be01655a86 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -28,6 +28,8 @@ #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" #include "tests/NEON/Accessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/LargeConvolutionLayerDataset.h" @@ -571,6 +573,99 @@ TEST_SUITE(DirectGEMMConv2d) template using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture; +/** Test case for memory injection in @ref cpu::CpuGemmDirectConv2d. + * + * Configure the operator once and inject memory at run-time in multiple executions. 
+ * + * Checks performed in order: + * - Both runs compute the same output + */ +TEST_CASE(MemoryInjection, framework::DatasetMode::ALL) +{ + auto conv = std::make_unique(); + const auto src_info = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC); + const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto bias_info = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC); + auto dst_info = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto conv_info = Conv2dInfo{}; + conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info); + + // tensors are newly created every call of this lambda function + auto src = create_tensor(src_info); + auto weight = create_tensor(weight_info); + auto bias = create_tensor(bias_info); + src.allocator()->allocate(); + weight.allocator()->allocate(); + bias.allocator()->allocate(); + + ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } }; + ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } }; + + auto mg = MemoryGroup{}; + auto ws = manage_workspace(conv->workspace(), mg, run_pack, prep_pack); + + auto run_conv = [&]() -> Tensor + { + auto dst = create_tensor(dst_info); + dst.allocator()->allocate(); + run_pack.add_tensor(TensorType::ACL_DST, &dst); + + library->fill_tensor_value(Accessor(src), 1.f); + library->fill_tensor_value(Accessor(weight), 2.f); + library->fill_tensor_value(Accessor(bias), 3.f); + // This operator is configured once and captured by this lambda. + conv->prepare(prep_pack); + conv->run(run_pack); + return dst; + }; + auto result_0 = run_conv(); + auto result_1 = run_conv(); + for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + { + ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS); + } +} + +/** Test case for memory injection in @ref NEGEMMConv2d. + * + * Make sure @ref NEGEMMConv2d still works through injecting the memory at configure time using the old API. 
+ * + * Checks performed in order: + * - Both runs compute the same output + */ +TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL) +{ + auto conv = std::make_unique(); + const auto src_info = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC); + const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto bias_info = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC); + auto dst_info = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto conv_info = Conv2dInfo{}; + auto run_conv = [&]() + { + auto src = create_tensor(src_info); + auto weight = create_tensor(weight_info); + auto bias = create_tensor(bias_info); + auto dst = create_tensor(dst_info); + conv->configure(&src, &weight, &bias, &dst, conv_info); + src.allocator()->allocate(); + weight.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + library->fill_tensor_value(Accessor(src), 1.f); + library->fill_tensor_value(Accessor(weight), 2.f); + library->fill_tensor_value(Accessor(bias), 3.f); + conv->run(); + return dst; + }; + auto result_0 = run_conv(); + auto result_1 = run_conv(); + for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + { + ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS); + } +} + TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), -- cgit v1.2.1
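
For reference, the caller-side flow that the MemoryInjection test above exercises can be condensed as follows. This is a minimal sketch rather than part of the patch: the wrapper function run_injected_conv is illustrative only, and the include set is assumed from the files touched in this change.

#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"

using namespace arm_compute;

void run_injected_conv(const TensorInfo &src_info, const TensorInfo &weight_info,
                       const TensorInfo &bias_info, TensorInfo &dst_info)
{
    // 1) Configure from tensor infos only: the operator keeps no tensor memory of its own.
    cpu::CpuGemmDirectConv2d conv{};
    conv.configure(&src_info, &weight_info, &bias_info, &dst_info, Conv2dInfo{});

    // 2) The caller owns the I/O tensors and passes them in through tensor packs.
    Tensor src{}, weight{}, bias{}, dst{};
    src.allocator()->init(src_info);
    weight.allocator()->init(weight_info);
    bias.allocator()->init(bias_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    weight.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight },
                          { TensorType::ACL_SRC_2, &bias }, { TensorType::ACL_DST, &dst } };
    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };

    // 3) Allocate the auxiliary buffers advertised by workspace() (Persistent
    //    pre-transposed weights, Temporary assembly workspace) and inject them
    //    into both packs.
    MemoryGroup mg{};
    auto ws = manage_workspace<Tensor>(conv.workspace(), mg, run_pack, prep_pack);

    // 4) prepare() pre-transposes the weights into the injected buffer; run()
    //    can then be called repeatedly, also with different run packs.
    conv.prepare(prep_pack);
    conv.run(run_pack);
}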
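
On the operator side, the same interface is served by the declare-then-resolve pattern that Fallback adopts in this patch: configure() only records each auxiliary buffer as a MemoryInfo entry keyed by offset_int_vec(), and prepare()/run() later pull the caller-provided buffer back out of the injected ITensorPack through CpuAuxTensorHandler. The hypothetical single-buffer operator below shows just that pattern; ExampleAsmOp and its scratch size are invented for illustration, while the types and helpers are the ones used in the hunks above.

#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute;
using namespace arm_compute::experimental;

class ExampleAsmOp
{
public:
    void configure(size_t scratch_bytes)
    {
        // Describe the scratch buffer; nothing is allocated by the operator itself.
        _scratch_info     = TensorInfo(TensorShape(scratch_bytes), 1, DataType::U8);
        _aux_mem[Scratch] = MemoryInfo(offset_int_vec(Scratch), MemoryLifetime::Temporary,
                                       scratch_bytes, 4096 /* alignment */);
    }

    experimental::MemoryRequirements workspace() const
    {
        return _aux_mem;
    }

    void run(ITensorPack &tensors)
    {
        // Resolve the caller-provided buffer from the injected pack.
        cpu::CpuAuxTensorHandler scratch(offset_int_vec(Scratch), _scratch_info, tensors, false);
        ARM_COMPUTE_ERROR_ON(scratch.get()->buffer() == nullptr);
        // ... hand scratch.get()->buffer() to the kernel as working space ...
    }

private:
    enum AuxTensorIdx
    {
        Scratch = 0,
        Count
    };

    TensorInfo                       _scratch_info{};
    experimental::MemoryRequirements _aux_mem{ Count };
};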