 src/cpu/operators/CpuGemmConv2d.cpp   | 29
 src/cpu/utils/CpuAuxTensorHandler.h   | 78
 src/gpu/cl/utils/ClAuxTensorHandler.h | 64
 3 files changed, 141 insertions(+), 30 deletions(-)
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 31c873c2ba..7460f2020c 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -839,23 +839,26 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1);
ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
// Re-interpreted weights. Only tensor shape is changed. Only memory import, no allocation
+ const bool use_reinterpreted_wei = (_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose);
CpuAuxTensorHandler reinterpreted_wei(
_weights_reshaped, *weights,
/* import only if we chose the ReinterpretThenTranspose path, because otherwise the weight may have been freed */
- !(_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose));
- CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
+ !use_reinterpreted_wei);
+
+ const bool use_reshaped_wei = (_run_wt && (_wt_method == WeightTransformMethod::ReshapeThenTranspose ||
+ _wt_method == WeightTransformMethod::FusedReshapeAndTranspose));
+ CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors,
+ false /* pack_inject */, !use_reshaped_wei /* bypass_alloc */,
+ !use_reshaped_wei /* bypass_import */
+ );
// Update the weights to use if it has been reshaped
- if (_run_wt)
+ if (use_reinterpreted_wei)
{
- if (_wt_method == WeightTransformMethod::ReinterpretThenTranspose)
- {
- gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
- }
- else if (_wt_method == WeightTransformMethod::ReshapeThenTranspose ||
- _wt_method == WeightTransformMethod::FusedReshapeAndTranspose)
- {
- gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
- }
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
+ }
+ else if (use_reshaped_wei)
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
}
// Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
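For illustration, a minimal sketch (not part of this patch) of the bypass pattern applied above: the handler is constructed with its bypass flags set to the inverse of whether it will actually be used, and it is only added to the tensor pack on the path that needs it. The helper name, operator type, slot id and tensor names are hypothetical placeholders, and the snippet assumes it is compiled inside the library source tree, since the src/ headers are internal.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute;

// Hypothetical helper: run `op` with an optional auxiliary workspace tensor.
template <typename Op>
void run_with_optional_aux(Op &op, ITensorPack &tensors, TensorInfo &aux_info, int aux_slot, bool use_aux)
{
    // Bypass both allocation and import when the aux tensor is not needed,
    // so no memory is touched on the unused path.
    cpu::CpuAuxTensorHandler aux(aux_slot, aux_info, tensors,
                                 /* pack_inject   */ false,
                                 /* bypass_alloc  */ !use_aux,
                                 /* bypass_import */ !use_aux);
    if (use_aux)
    {
        tensors.add_tensor(TensorType::ACL_INT_0, aux.get());
    }
    op.run(tensors);
}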
diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h
index 0a39fdba81..3b980ce60b 100644
--- a/src/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/cpu/utils/CpuAuxTensorHandler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,12 +35,74 @@ namespace arm_compute
{
namespace cpu
{
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
+/** Tensor handler to wrap and handle tensor allocations on workspace buffers
+ *
+ * @note Important: Despite the impression given by its name, the handler owns, rather than merely points to, the
+ * underlying tensor memory.
+ *
+ * @note About memory handling using bypass_* flags
+ * The bypass_alloc / bypass_import flags are meant to skip the expensive auxiliary tensor memory allocations or
+ * imports that are not needed during runtime, e.g. when the handler is not used at all in some branch of execution.
+ *
+ * If not handled correctly, these two flags can lead to performance issues (not bypassing when we should), or to
+ * memory bugs (bypassing when we should not).
+ *
+ * Make sure:
+ *
+ * 1. The aux tensor handlers must always be declared at the root level, or the same level as the run/prepare
+ * methods that potentially use them.
+ *
+ * Once the handler is destroyed (e.g. when going out of scope), the memory it owns (returned by the get()
+ * method) will also be destroyed.
+ *
+ * Thus it's important to ensure the handler is always in scope when it is being used by an operator / kernel.
+ *
+ * 2. The handler's bypass_alloc and bypass_import flags should always be the inverse of whether the handler is
+ * used in its surrounding scope by run/prepare (this usually means being added to some tensor pack).
+ *
+ * This ensures we bypass if and only if the aux tensor is not used by the op / kernel later.
+ *
+ *
+ * So the general usage pattern goes like this:
+ *
+ * bool use_aux_tensor = some_condition_about_when_to_use_the_aux_tensor
+ *
+ * CpuAuxTensorHandler aux_handler {..., !use_aux_tensor}; // !use_aux_tensor as bypass_alloc / bypass_import
+ *
+ * if (use_aux_tensor)
+ * {
+ * tensor_pack.add_tensor(aux_handler.get());
+ * }
+ * op.run(tensor_pack);
+ */
class CpuAuxTensorHandler
{
public:
- CpuAuxTensorHandler(
- int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+ /** Create a temporary tensor handle, by either importing an existing tensor from a tensor pack, or allocating a
+ * new one.
+ *
+ * @param[in] slot_id Slot id of the tensor to be retrieved in the tensor pack
+ * If no such tensor exists in the tensor pack, a new tensor will be allocated.
+ * @param[in] info Tensor info containing the requested size of the new tensor.
+ * If the requested size is larger than that of the tensor retrieved from the tensor pack,
+ * a new tensor will be allocated.
+ * @param[in,out] pack Tensor pack to retrieve the old tensor. When @p pack_inject is true, the new
+ * tensor will also be added here.
+ * @param[in] pack_inject In case of a newly allocated tensor, whether to add this tensor back to the
+ * @p pack
+ * @param[in] bypass_alloc Bypass allocation in the case of a newly created tensor.
+ * This is to prevent unnecessary memory operations when the handler object is not
+ * used.
+ * @param[in] bypass_import Bypass the memory import in the case of a tensor retrieved from the pack.
+ * This is to prevent unnecessary memory operations when the handler object is not
+ * used.
+ */
+ CpuAuxTensorHandler(int slot_id,
+ TensorInfo &info,
+ ITensorPack &pack,
+ bool pack_inject = false,
+ bool bypass_alloc = false,
+ bool bypass_import = false)
: _tensor()
{
if (info.total_size() == 0)
@@ -67,7 +129,10 @@ public:
}
else
{
- _tensor.allocator()->import_memory(packed_tensor->buffer());
+ if (!bypass_import)
+ {
+ _tensor.allocator()->import_memory(packed_tensor->buffer());
+ }
}
}
@@ -76,7 +141,8 @@ public:
*
* @param[in] info New tensor info to "assign" to @p tensor
* @param[in] tensor Tensor to be assigned a new @ref TensorInfo
- * @param[in] bypass_import Bypass importing @p tensor's memory into the handler
+ * @param[in] bypass_import Bypass importing @p tensor's memory into the handler.
+ * This is to prevent unnecessary memory operations when the handler object is not used
*/
CpuAuxTensorHandler(TensorInfo &info, const ITensor &tensor, bool bypass_import = false) : _tensor()
{
diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h
index 81dc3baef4..12226699f8 100644
--- a/src/gpu/cl/utils/ClAuxTensorHandler.h
+++ b/src/gpu/cl/utils/ClAuxTensorHandler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
+#ifndef ACL_SRC_GPU_CL_UTILS_CLAUXTENSORHANDLER_H
+#define ACL_SRC_GPU_CL_UTILS_CLAUXTENSORHANDLER_H
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
@@ -35,12 +35,39 @@ namespace arm_compute
{
namespace opencl
{
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
+/** Tensor handler to wrap and handle tensor allocations on workspace buffers
+ *
+ * @note About memory handling using bypass_* flags
+ * See @ref arm_compute::cpu::CpuAuxTensorHandler
+ */
class CLAuxTensorHandler
{
public:
- CLAuxTensorHandler(
- int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+ /** Create a temporary tensor handle, by either importing an existing tensor from a tensor pack, or allocating a
+ * new one.
+ *
+ * @param[in] slot_id Slot id of the tensor to be retrieved in the tensor pack
+ * If no such tensor exists in the tensor pack, a new tensor will be allocated.
+ * @param[in] info Tensor info containing the requested size of the new tensor.
+ * If the requested size is larger than that of the tensor retrieved from the tensor pack,
+ * a new tensor will be allocated.
+ * @param[in,out] pack Tensor pack to retrieve the old tensor. When @p pack_inject is true, the new
+ * tensor will also be added here.
+ * @param[in] pack_inject In case of a newly allocated tensor, whether to add this tensor back to the
+ * @p pack
+ * @param[in] bypass_alloc Bypass allocation in the case of a newly created tensor.
+ * This is to prevent unnecessary memory operations when the handler object is not
+ * used.
+ * @param[in] bypass_import Bypass the memory import in the case of a tensor retrieved from the pack.
+ * This is to prevent unnecessary memory operations when the handler object is not
+ * used.
+ */
+ CLAuxTensorHandler(int slot_id,
+ TensorInfo &info,
+ ITensorPack &pack,
+ bool pack_inject = false,
+ bool bypass_alloc = false,
+ bool bypass_import = false)
: _tensor()
{
if (info.total_size() == 0)
@@ -67,16 +94,31 @@ public:
}
else
{
- _tensor.allocator()->import_memory(packed_tensor->cl_buffer());
+ if (!bypass_import)
+ {
+ _tensor.allocator()->import_memory(packed_tensor->cl_buffer());
+ }
}
}
- CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor()
+ /** Create a temporary handle to the original tensor with a new @ref TensorInfo
+ * This is useful if we want to change a tensor's tensor info at run time without modifying the original tensor
+ *
+ * @param[in] info New tensor info to "assign" to @p tensor
+ * @param[in] tensor Tensor to be assigned a new @ref TensorInfo
+ * @param[in] bypass_import Bypass importing @p tensor's memory into the handler.
+ * This is to prevent unnecessary memory operations when the handler object is not used
+ */
+ CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor, bool bypass_import = false) : _tensor()
{
_tensor.allocator()->soft_init(info);
- if (info.total_size() <= tensor.info()->total_size())
+ if (!bypass_import)
{
- _tensor.allocator()->import_memory(tensor.cl_buffer());
+ ARM_COMPUTE_ERROR_ON(tensor.info() == nullptr);
+ if (info.total_size() <= tensor.info()->total_size())
+ {
+ _tensor.allocator()->import_memory(tensor.cl_buffer());
+ }
}
}
@@ -108,4 +150,4 @@ private:
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
+#endif // ACL_SRC_GPU_CL_UTILS_CLAUXTENSORHANDLER_H
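Similarly, a minimal sketch (not part of this patch) of the new bypass_import parameter on the reinterpreting CLAuxTensorHandler constructor: the original buffer is imported only on the path that actually consumes the reinterpreted view, mirroring the CpuGemmConv2d::run() change above. The helper name, operator type and tensor names are hypothetical placeholders.

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"

using namespace arm_compute;

// Hypothetical helper: feed `op` either the original weights or a reinterpreted view of them.
template <typename Op>
void run_with_reinterpreted_weights(
    Op &op, ITensorPack &tensors, ICLTensor &weights, TensorInfo &reinterpreted_info, bool use_reinterpreted)
{
    // Import the weights' buffer into the handler only when the reinterpreted view is
    // actually consumed; on the other path the buffer may no longer be valid.
    opencl::CLAuxTensorHandler reinterpreted(reinterpreted_info, weights,
                                             /* bypass_import */ !use_reinterpreted);
    if (use_reinterpreted)
    {
        tensors.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted.get());
    }
    op.run(tensors);
}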