author     Georgios Pinitas <georgios.pinitas@arm.com>   2021-07-08 18:14:45 +0100
committer  Georgios Pinitas <georgios.pinitas@arm.com>   2021-07-16 14:39:47 +0000
commit     2b147ee857eb237613670460c52efedd43601955 (patch)
tree       2c2f66754dca6d83e4967daae600e84bca8ca6b4 /src/runtime
parent     d0c5df2695e6e30d600c0339f547373c0c6667b0 (diff)
Avoid multiple Rhs matrix transformation on ClGemm

ClWinogradConv2d was performing the Rhs transformation on every run,
impacting performance.

Adds scope logging support through ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME.

Resolves: COMPMID-4596
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ib329d3bc8d8aa21abae9fabfe61de35cc84d4819
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5925
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime')
-rw-r--r--  src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp     | 11
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp                       |  4
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConv2d.cpp                 |  4
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp       |  4
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp |  4
-rw-r--r--  src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp   |  4
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemm.cpp                     |  4
-rw-r--r--  src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp           | 32
8 files changed, 42 insertions(+), 25 deletions(-)
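
The hunks below all serve one goal: perform the expensive Rhs/weight transformation once, inside prepare(), instead of on every run(). As a minimal, self-contained illustration of that prepare-once pattern (SimpleOp and its members are hypothetical stand-ins, not arm_compute API):

#include <iostream>
#include <vector>

class SimpleOp
{
public:
    void prepare(const std::vector<float> &weights)
    {
        if(!_is_prepared)
        {
            // Expensive one-off transformation (e.g. Winograd filter transform,
            // GEMM Rhs reshape). Doing this in run() would repeat it every step.
            _transformed.assign(weights.rbegin(), weights.rend());
            _is_prepared = true;
        }
    }

    void run(const std::vector<float> &weights)
    {
        prepare(weights); // no-op after the first call
        std::cout << "running with " << _transformed.size() << " transformed weights\n";
    }

private:
    std::vector<float> _transformed{};
    bool               _is_prepared{ false };
};

int main()
{
    SimpleOp op;
    const std::vector<float> w{ 1.f, 2.f, 3.f };
    op.run(w); // transforms once
    op.run(w); // reuses the cached transformation
}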
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index f758c3d0b3..fa01c914c5 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -41,7 +41,6 @@ struct CLWinogradConvolutionLayer::Impl
ICLTensor *dst{ nullptr };
std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
ITensorPack run_pack{};
- ITensorPack prep_pack{};
MemoryGroup memory_group{};
WorkspaceData<CLTensor> workspace_tensors{};
bool is_prepared{ false };
@@ -80,9 +79,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte
{ TensorType::ACL_SRC_2, _impl->biases },
{ TensorType::ACL_DST, _impl->dst }
};
-
- _impl->prep_pack = { { TensorType::ACL_SRC_1, _impl->weights } };
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -102,7 +99,11 @@ void CLWinogradConvolutionLayer::prepare()
{
if(!_impl->is_prepared)
{
- _impl->op->prepare(_impl->prep_pack);
+ _impl->op->prepare(_impl->run_pack);
+
+ // Release Preparation tensors
+ release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack);
+ _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1);
_impl->is_prepared = true;
}
}
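
A hedged sketch of the pack handling in the hunk above: the dedicated prep_pack is gone, prepare() pulls the weights straight out of run_pack, and the weights slot is then removed so later runs cannot reference the released tensor. The toy TensorPack below is a plain std::map standing in for arm_compute's ITensorPack:

#include <iostream>
#include <map>
#include <string>

enum Slot { ACL_SRC_0 = 0, ACL_SRC_1 = 1, ACL_DST = 2 };
using TensorPack = std::map<Slot, std::string>; // slot -> tensor name (toy model)

void prepare(TensorPack &run_pack)
{
    std::cout << "transforming " << run_pack.at(ACL_SRC_1) << " once\n";
    // After the one-off transformation the original weights are not needed:
    run_pack.erase(ACL_SRC_1); // mirrors run_pack.remove_tensor(TensorType::ACL_SRC_1)
}

int main()
{
    TensorPack run_pack{ { ACL_SRC_0, "src" }, { ACL_SRC_1, "weights" }, { ACL_DST, "dst" } };
    prepare(run_pack);
    std::cout << "weights still in pack? " << (run_pack.count(ACL_SRC_1) ? "yes" : "no") << "\n";
}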
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 168d93022f..4bf330fa1e 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -114,12 +114,12 @@ void NEGEMM::prepare()
// Release temporary tensors that are only used in prepare stage
for(auto &ws : _impl->workspace)
{
- const int slot = ws.first;
+ const int slot = ws.slot;
for(auto &m : _impl->aux_mem_req)
{
if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
{
- auto tensor = ws.second.get();
+ auto tensor = ws.tensor.get();
tensor->allocator()->free();
break;
}
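
This hunk, repeated verbatim in the four NEON functions that follow, swaps std::pair field access (ws.first / ws.second) for named members (ws.slot / ws.tensor) on the workspace elements. A self-contained toy model of the release loop, with WorkspaceElement and MemReq as assumed stand-ins for the library's internal types:

#include <iostream>
#include <memory>
#include <vector>

enum class MemoryLifetime { Temporary, Persistent, Prepare };

struct Tensor
{
    void free() { std::cout << "freed\n"; }
};

struct WorkspaceElement
{
    int                     slot{ -1 };
    std::unique_ptr<Tensor> tensor{ nullptr };
};

struct MemReq
{
    int            slot{ -1 };
    MemoryLifetime lifetime{ MemoryLifetime::Temporary };
};

int main()
{
    std::vector<WorkspaceElement> workspace;
    workspace.push_back({ 3, std::make_unique<Tensor>() });
    const std::vector<MemReq> aux_mem_req{ { 3, MemoryLifetime::Prepare } };

    // Release tensors that are only needed during the prepare stage.
    for(auto &ws : workspace)
    {
        const int slot = ws.slot; // previously ws.first on a std::pair
        for(auto &m : aux_mem_req)
        {
            if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
            {
                auto tensor = ws.tensor.get(); // previously ws.second.get()
                tensor->free();
                break;
            }
        }
    }
}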
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 3ca5239ae3..7e2ce70444 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -104,12 +104,12 @@ void NEGEMMConv2d::prepare()
// Release temporary tensors that are only used in prepare stage
for(auto &ws : _impl->workspace)
{
- const int slot = ws.first;
+ const int slot = ws.slot;
for(auto &m : _impl->aux_mem_req)
{
if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
{
- auto tensor = ws.second.get();
+ auto tensor = ws.tensor.get();
tensor->allocator()->free();
break;
}
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 6386a678db..23ffbce954 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -107,12 +107,12 @@ void NEGEMMConvolutionLayer::prepare()
}
for(auto &ws : _impl->workspace_tensors)
{
- const int slot = ws.first;
+ const int slot = ws.slot;
for(auto &m : _impl->aux_mem_req)
{
if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
{
- auto tensor = ws.second.get();
+ auto tensor = ws.tensor.get();
tensor->allocator()->free();
break;
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 641a2c2b5f..64507495ca 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -110,12 +110,12 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
// Release temporary tensors that are only used in prepare stage
for(auto &ws : _impl->workspace_tensors)
{
- const int slot = ws.first;
+ const int slot = ws.slot;
for(auto &m : _impl->aux_mem_req)
{
if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
{
- auto tensor = ws.second.get();
+ auto tensor = ws.tensor.get();
tensor->allocator()->free();
break;
}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 745179c050..b91048a426 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -99,12 +99,12 @@ void NEWinogradConvolutionLayer::prepare()
// Release temporary tensors that are only used in prepare stage
for(auto &ws : _impl->workspace)
{
- const int slot = ws.first;
+ const int slot = ws.slot;
for(auto &m : _impl->aux_mem_req)
{
if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
{
- auto tensor = ws.second.get();
+ auto tensor = ws.tensor.get();
tensor->allocator()->free();
break;
}
diff --git a/src/runtime/gpu/cl/operators/ClGemm.cpp b/src/runtime/gpu/cl/operators/ClGemm.cpp
index a80375447d..cb0eecae4b 100644
--- a/src/runtime/gpu/cl/operators/ClGemm.cpp
+++ b/src/runtime/gpu/cl/operators/ClGemm.cpp
@@ -37,6 +37,8 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/gpu/cl/IClKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
@@ -744,6 +746,8 @@ void ClGemm::prepare(ITensorPack &constants)
// If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
{
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
+
CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
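
A condensed sketch of the guard above: in prepare(), the Rhs is re-transformed only when its backing memory is persistent and both fresh weights and the auxiliary buffer are available; otherwise the previously reshaped Rhs is assumed valid. The LOG_INFO_WITH_FUNCNAME macro below merely mimics ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL; all other names are hypothetical:

#include <cstdio>

enum class MemoryLifetime { Temporary, Persistent, Prepare };

#define LOG_INFO_WITH_FUNCNAME(msg) std::printf("%s: %s\n", __func__, msg)

void prepare_rhs(MemoryLifetime rhs_lifetime, const void *src1, void *rhs_aux)
{
    if(rhs_lifetime == MemoryLifetime::Persistent && src1 != nullptr && rhs_aux != nullptr)
    {
        LOG_INFO_WITH_FUNCNAME("Transforming RHS Matrix!");
        // ... enqueue the reshape kernel writing into rhs_aux ...
    }
    // else: assume the RHS was already transformed and cached.
}

int main()
{
    int src = 0, aux = 0;
    prepare_rhs(MemoryLifetime::Persistent, &src, &aux); // logs and re-transforms
    prepare_rhs(MemoryLifetime::Prepare, &src, &aux);    // reuses cached RHS
}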
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
index c8db697778..2ca1ff59df 100644
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -212,9 +212,15 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso
// Configure output transform
_output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
- _aux_mem = _batched_mm.workspace();
+ _aux_mem = _batched_mm.workspace();
+ const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
+ {
+ return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
+ }) ?
+ MemoryLifetime::Prepare :
+ MemoryLifetime::Persistent;
_aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
- _aux_mem.push_back(MemoryInfo(offset_int_vec(3), MemoryLifetime::Persistent, _input1.total_size()));
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
_aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
}
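
The std::any_of above decides who owns the transformed weights. If the batched GEMM already keeps a persistent, non-empty buffer for its reshaped Rhs, the Winograd weight buffer only needs to survive prepare(); otherwise it must itself stay persistent. A minimal sketch under that assumption (MemoryInfo is simplified, not the library type):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

enum class MemoryLifetime { Temporary, Persistent, Prepare };

struct MemoryInfo
{
    MemoryLifetime lifetime{ MemoryLifetime::Temporary };
    std::size_t    size{ 0 };
};

MemoryLifetime winograd_weights_lifetime(const std::vector<MemoryInfo> &gemm_workspace)
{
    // Does the GEMM hold on to a persistent reshaped Rhs of its own?
    const bool gemm_keeps_rhs = std::any_of(gemm_workspace.begin(), gemm_workspace.end(), [](const MemoryInfo &r)
    {
        return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
    });
    return gemm_keeps_rhs ? MemoryLifetime::Prepare : MemoryLifetime::Persistent;
}

int main()
{
    const std::vector<MemoryInfo> ws{ { MemoryLifetime::Persistent, 1024 } };
    std::cout << (winograd_weights_lifetime(ws) == MemoryLifetime::Prepare ? "Prepare\n" : "Persistent\n");
}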
@@ -229,7 +235,6 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
{
prepare(tensors);
- // Run input transform
auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
@@ -238,6 +243,7 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true);
CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
+ // Run input transform
ITensorPack pack_it
{
{ TensorType::ACL_SRC, src },
@@ -247,12 +253,17 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
CLScheduler::get().enqueue_op(*_input_transform, pack_it);
// Run batched matrix multiplication
- ITensorPack pack_mm
+ ITensorPack pack_mm = tensors;
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
+ pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
+ if(_aux_mem[3].lifetime == MemoryLifetime::Prepare)
{
- { TensorType::ACL_SRC_0, input0.get() },
- { TensorType::ACL_SRC_1, input1.get() },
- { TensorType::ACL_DST, batched_mm_output.get() },
- };
+ pack_mm.remove_tensor(TensorType::ACL_SRC_1);
+ }
+ else
+ {
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+ }
_batched_mm.run(pack_mm);
// Run output transform
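
A toy version of the pack_mm construction above: the incoming pack is copied, the transformed input and output are added, and the Rhs slot is either dropped (the GEMM captured the reshaped weights during prepare()) or pointed at the Winograd-transformed weights. The prepare() hunk below applies the same copy-don't-mutate idea via a local mm_prepare_pack, leaving the caller's pack untouched. Again a plain std::map stands in for ITensorPack:

#include <iostream>
#include <map>
#include <string>

enum Slot { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_DST };
enum class MemoryLifetime { Temporary, Persistent, Prepare };
using TensorPack = std::map<Slot, std::string>;

TensorPack make_mm_pack(const TensorPack &tensors, MemoryLifetime wei_lifetime)
{
    TensorPack pack_mm = tensors; // copy, like `ITensorPack pack_mm = tensors;`
    pack_mm[ACL_SRC_0] = "input_transformed";
    pack_mm[ACL_DST]   = "batched_mm_output";
    if(wei_lifetime == MemoryLifetime::Prepare)
    {
        pack_mm.erase(ACL_SRC_1); // GEMM already owns the reshaped Rhs
    }
    else
    {
        pack_mm[ACL_SRC_1] = "weights_transformed";
    }
    return pack_mm;
}

int main()
{
    const TensorPack tensors{ { ACL_SRC_1, "weights" } };
    std::cout << make_mm_pack(tensors, MemoryLifetime::Prepare).count(ACL_SRC_1) << "\n"; // prints 0
}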
@@ -282,9 +293,10 @@ void ClWinogradConv2d::prepare(ITensorPack &tensors)
CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
weights->mark_as_unused();
- tensors.add_tensor(ACL_SRC_1, input1.get());
// Prepare GEMM and release reshaped weights if marked unused by ClGemm
- _batched_mm.prepare(tensors);
+ ITensorPack mm_prepare_pack = tensors;
+ mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get());
+ _batched_mm.prepare(mm_prepare_pack);
CLScheduler::get().queue().finish();
_is_prepared = true;