 src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp | 19 +++++++------------
 src/runtime/gpu/cl/utils/ClAuxTensorHandler.h     | 10 ++++++++--
 2 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
index 2ca1ff59df..07f90ddaef 100644
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -233,37 +233,32 @@ Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *wei
void ClWinogradConv2d::run(ITensorPack &tensors)
{
- prepare(tensors);
+ const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare;
auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
- CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true);
+ CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped);
CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
+ prepare(tensors);
+
// Run input transform
ITensorPack pack_it
{
{ TensorType::ACL_SRC, src },
{ TensorType::ACL_DST, input0.get() },
};
- CLScheduler::get().enqueue_op(_border_handler, pack_it);
- CLScheduler::get().enqueue_op(*_input_transform, pack_it);
+ CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
+ CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
// Run batched matrix multiplication
ITensorPack pack_mm = tensors;
pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
- if(_aux_mem[3].lifetime == MemoryLifetime::Prepare)
- {
- pack_mm.remove_tensor(TensorType::ACL_SRC_1);
- }
- else
- {
- pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
- }
+ is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
_batched_mm.run(pack_mm);
// Run output transform
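
Two things change in run() above. First, the MemoryLifetime::Prepare check on _aux_mem[3] is hoisted into an is_gemm_reshaped flag that is computed once and reused both when constructing the input1 handler and when fixing up the GEMM pack, and prepare() now runs after the auxiliary tensor handlers exist. Second, the border handler and input transform are now enqueued with a third argument of false, which in Compute Library is CLScheduler::enqueue_op's flush flag: intermediate kernels stay batched on the command queue and the flush is deferred to a later enqueue (presumably the trailing output-transform enqueue, outside this hunk, keeps the default flushing behaviour). The toy scheduler below is a hedged sketch of that effect, not ACL code; Kernel and Scheduler are hypothetical stand-ins for ICLKernel and CLScheduler:

// Standalone sketch: counts queue flushes to show why deferring the
// flush on intermediate kernels reduces driver round-trips.
#include <functional>
#include <iostream>

struct Kernel
{
    std::function<void()> body; // stand-in for an ICLKernel's work
};

class Scheduler
{
public:
    // Mirrors the shape of CLScheduler::enqueue_op(kernel, pack, flush):
    // run the kernel, then optionally flush the command queue.
    void enqueue_op(const Kernel &k, bool flush = true)
    {
        k.body();
        if(flush)
        {
            ++_flush_count; // models a clFlush() on the CL command queue
        }
    }
    int flush_count() const
    {
        return _flush_count;
    }

private:
    int _flush_count{ 0 };
};

int main()
{
    Scheduler sched;
    Kernel border{ [] { std::cout << "border handler\n"; } };
    Kernel input_transform{ [] { std::cout << "input transform\n"; } };
    Kernel output_transform{ [] { std::cout << "output transform\n"; } };

    // As in the patched run(): intermediate kernels do not flush...
    sched.enqueue_op(border, /* flush = */ false);
    sched.enqueue_op(input_transform, /* flush = */ false);
    // ...and the tail of the sequence flushes once for the whole batch.
    sched.enqueue_op(output_transform, /* flush = */ true);

    std::cout << "flushes: " << sched.flush_count() << '\n'; // prints 1
    return 0;
}

With the two intermediate enqueues deferred, the queue is flushed once per run() instead of once per kernel.
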
diff --git a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
index 1cf717cf6f..af383489a1 100644
--- a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
+++ b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/common/utils/Log.h"
#include "support/Cast.h"
namespace arm_compute
@@ -38,7 +39,7 @@ namespace opencl
class CLAuxTensorHandler
{
public:
- CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
+ CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
: _tensor()
{
if(info.total_size() == 0)
@@ -50,7 +51,12 @@ public:
ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
{
- _tensor.allocator()->allocate();
+ if(!bypass_alloc)
+ {
+ _tensor.allocator()->allocate();
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+ }
+
if(pack_inject)
{
pack.add_tensor(slot_id, &_tensor);