From b76eaed55a89330b3b448c4f4522b3fc94a4f38d Mon Sep 17 00:00:00 2001
From: Finn Williams
Date: Wed, 31 Mar 2021 16:22:40 +0100
Subject: IVGCVSW-5793 Add default ExecuteAsync implementation to Workload

Signed-off-by: Finn Williams
Change-Id: If2069b4d274286e654ac2bceb52d147f9ee3a7a9
---
 src/backends/backendsCommon/Workload.hpp        |  16 +-
 src/backends/backendsCommon/test/CMakeLists.txt |   1 +
 .../test/DefaultAsyncExecuteTest.cpp            | 249 +++++++++++++++++++++
 3 files changed, 264 insertions(+), 2 deletions(-)
 create mode 100644 src/backends/backendsCommon/test/DefaultAsyncExecuteTest.cpp

diff --git a/src/backends/backendsCommon/Workload.hpp b/src/backends/backendsCommon/Workload.hpp
index 940b878d2f..87869c9841 100644
--- a/src/backends/backendsCommon/Workload.hpp
+++ b/src/backends/backendsCommon/Workload.hpp
@@ -37,7 +37,16 @@ public:
         m_Data.Validate(info);
     }
 
-    void ExecuteAsync(WorkingMemDescriptor&) override {};
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override
+    {
+        ARMNN_LOG(info) << "Using default async workload execution, this will affect network performance";
+        std::lock_guard<std::mutex> lockGuard(m_AsyncWorkloadMutex);
+
+        m_Data.m_Inputs = workingMemDescriptor.m_Inputs;
+        m_Data.m_Outputs = workingMemDescriptor.m_Outputs;
+
+        Execute();
+    };
 
     void PostAllocationConfigure() override {}
 
@@ -46,8 +55,11 @@ public:
     profiling::ProfilingGuid GetGuid() const final { return m_Guid; }
 
 protected:
-    const QueueDescriptor m_Data;
+    QueueDescriptor m_Data;
     const profiling::ProfilingGuid m_Guid;
+
+private:
+    std::mutex m_AsyncWorkloadMutex;
 };
 
 // TypedWorkload used
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 9d36f52b59..248ada92a2 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -15,6 +15,7 @@ list(APPEND armnnBackendsCommonUnitTests_sources
     ComparisonEndToEndTestImpl.hpp
     DataLayoutUtils.hpp
     DataTypeUtils.hpp
+    DefaultAsyncExecuteTest.cpp
     DepthToSpaceEndToEndTestImpl.hpp
     DequantizeEndToEndTestImpl.hpp
     DetectionPostProcessEndToEndTestImpl.hpp
diff --git a/src/backends/backendsCommon/test/DefaultAsyncExecuteTest.cpp b/src/backends/backendsCommon/test/DefaultAsyncExecuteTest.cpp
new file mode 100644
index 0000000000..0d4595210e
--- /dev/null
+++ b/src/backends/backendsCommon/test/DefaultAsyncExecuteTest.cpp
@@ -0,0 +1,249 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/Tensor.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+#include <thread>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(WorkloadAsyncExecuteTests)
+
+namespace
+{
+
+struct Workload0 : BaseWorkload<ElementwiseUnaryQueueDescriptor>
+{
+    Workload0(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info)
+        : BaseWorkload(descriptor, info)
+    {
+    }
+
+    Workload0() : BaseWorkload(ElementwiseUnaryQueueDescriptor(), WorkloadInfo())
+    {
+    }
+
+    void Execute() const override
+    {
+        int* inVals = static_cast<int*>(m_Data.m_Inputs[0]->Map());
+        int* outVals = static_cast<int*>(m_Data.m_Outputs[0]->Map());
+
+        for (unsigned int i = 0; i < m_Data.m_Inputs[0]->GetShape().GetNumElements(); ++i)
+        {
+            outVals[i] = inVals[i] * outVals[i];
+            inVals[i] = outVals[i];
+        }
+    }
+
+    void ExecuteAsync(WorkingMemDescriptor& desc) override
+    {
+        int* inVals = static_cast<int*>(desc.m_Inputs[0]->Map());
+        int* outVals = static_cast<int*>(desc.m_Outputs[0]->Map());
+
+        for (unsigned int i = 0; i < desc.m_Inputs[0]->GetShape().GetNumElements(); ++i)
+        {
+            outVals[i] = inVals[i] + outVals[i];
+            inVals[i] = outVals[i];
+        }
+    }
+
+    QueueDescriptor* GetQueueDescriptor()
+    {
+        return &m_Data;
+    }
+};
+
+struct Workload1 : BaseWorkload<ElementwiseUnaryQueueDescriptor>
+{
+    Workload1(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info)
+        : BaseWorkload(descriptor, info)
+    {
+    }
+
+    void Execute() const override
+    {
+        int* inVals = static_cast<int*>(m_Data.m_Inputs[0]->Map());
+        int* outVals = static_cast<int*>(m_Data.m_Outputs[0]->Map());
+
+        for (unsigned int i = 0; i < m_Data.m_Inputs[0]->GetShape().GetNumElements(); ++i)
+        {
+            outVals[i] = inVals[i] * outVals[i];
+            inVals[i] = outVals[i];
+        }
+    }
+};
+
+void ValidateTensor(ITensorHandle* tensorHandle, int expectedValue)
+{
+    int* actualOutput = static_cast<int*>(tensorHandle->Map());
+
+    bool allValuesCorrect = true;
+    for (unsigned int i = 0; i < tensorHandle->GetShape().GetNumElements(); ++i)
+    {
+        if (actualOutput[i] != expectedValue)
+        {
+            allValuesCorrect = false;
+        }
+    }
+
+    BOOST_CHECK(allValuesCorrect);
+}
+
+template <typename Workload>
+std::unique_ptr<Workload> CreateWorkload(TensorInfo info, ITensorHandle* inputTensor, ITensorHandle* outputTensor)
+{
+    WorkloadInfo workloadInfo;
+    workloadInfo.m_InputTensorInfos = std::vector<TensorInfo>{info};
+    workloadInfo.m_OutputTensorInfos = std::vector<TensorInfo>{info};
+
+    ElementwiseUnaryQueueDescriptor elementwiseUnaryQueueDescriptor;
+    elementwiseUnaryQueueDescriptor.m_Inputs = std::vector<ITensorHandle*>{inputTensor};
+    elementwiseUnaryQueueDescriptor.m_Outputs = std::vector<ITensorHandle*>{outputTensor};
+
+    return std::make_unique<Workload>(elementwiseUnaryQueueDescriptor, workloadInfo);
+}
+
+BOOST_AUTO_TEST_CASE(TestAsyncExecute)
+{
+    TensorInfo info({5}, DataType::Signed32);
+
+    int inVals[5]{2, 2, 2, 2, 2};
+    int outVals[5]{1, 1, 1, 1, 1};
+
+    int expectedExecuteVal = 2;
+    int expectedExecuteAsyncVal = 3;
+
+    ConstTensor constInputTensor(info, inVals);
+    ConstTensor constOutputTensor(info, outVals);
+
+    ScopedCpuTensorHandle syncInput0(constInputTensor);
+    ScopedCpuTensorHandle syncOutput0(constOutputTensor);
+
+    std::unique_ptr<Workload0> workload0 = CreateWorkload<Workload0>(info, &syncInput0, &syncOutput0);
+
+    workload0->Execute();
+
+    ScopedCpuTensorHandle asyncInput0(constInputTensor);
+    ScopedCpuTensorHandle asyncOutput0(constOutputTensor);
+
+    WorkingMemDescriptor workingMemDescriptor0;
+    workingMemDescriptor0.m_Inputs = std::vector<ITensorHandle*>{&asyncInput0};
+    workingMemDescriptor0.m_Outputs = std::vector<ITensorHandle*>{&asyncOutput0};
+
+    workload0->ExecuteAsync(workingMemDescriptor0);
+
+    // The inputs are also changed by the Execute/ExecuteAsync calls to make sure there is no interference with them
+    ValidateTensor(workingMemDescriptor0.m_Outputs[0], expectedExecuteAsyncVal);
+    ValidateTensor(workingMemDescriptor0.m_Inputs[0], expectedExecuteAsyncVal);
+
+    ValidateTensor(workload0->GetQueueDescriptor()->m_Outputs[0], expectedExecuteVal);
+    ValidateTensor(workload0->GetQueueDescriptor()->m_Inputs[0], expectedExecuteVal);
+}
+
+BOOST_AUTO_TEST_CASE(TestDefaultAsyncExecute)
+{
+    TensorInfo info({5}, DataType::Signed32);
+
+    std::vector<int> inVals{2, 2, 2, 2, 2};
+    std::vector<int> outVals{1, 1, 1, 1, 1};
+    std::vector<int> defaultVals{0, 0, 0, 0, 0};
+
+    int expectedExecuteVal = 2;
+
+    ConstTensor constInputTensor(info, inVals);
+    ConstTensor constOutputTensor(info, outVals);
+    ConstTensor defaultTensor(info, defaultVals);
+
+    ScopedCpuTensorHandle defaultInput(defaultTensor);
+    ScopedCpuTensorHandle defaultOutput(defaultTensor);
+
+    std::unique_ptr<Workload1> workload1 = CreateWorkload<Workload1>(info, &defaultInput, &defaultOutput);
+
+    ScopedCpuTensorHandle asyncInput(constInputTensor);
+    ScopedCpuTensorHandle asyncOutput(constOutputTensor);
+
+    WorkingMemDescriptor workingMemDescriptor;
+    workingMemDescriptor.m_Inputs = std::vector<ITensorHandle*>{&asyncInput};
+    workingMemDescriptor.m_Outputs = std::vector<ITensorHandle*>{&asyncOutput};
+
+    workload1->ExecuteAsync(workingMemDescriptor);
+
+    // Workload1 has no ExecuteAsync implementation of its own, so it should fall back to the default
+    // workload ExecuteAsync implementation, which calls Workload1::Execute() in a thread-safe manner
+    ValidateTensor(workingMemDescriptor.m_Outputs[0], expectedExecuteVal);
+    ValidateTensor(workingMemDescriptor.m_Inputs[0], expectedExecuteVal);
+}
+
+BOOST_AUTO_TEST_CASE(TestDefaultAsyncExecuteWithThreads)
+{
+    // Use a large vector so the threads have a chance to interact
+    unsigned int vecSize = 1000;
+    TensorInfo info({vecSize}, DataType::Signed32);
+
+    std::vector<int> inVals1(vecSize, 2);
+    std::vector<int> outVals1(vecSize, 1);
+    std::vector<int> inVals2(vecSize, 5);
+    std::vector<int> outVals2(vecSize, -1);
+
+    std::vector<int> defaultVals(vecSize, 0);
+
+    int expectedExecuteVal1 = 4;
+    int expectedExecuteVal2 = 25;
+
+    ConstTensor constInputTensor1(info, inVals1);
+    ConstTensor constOutputTensor1(info, outVals1);
+
+    ConstTensor constInputTensor2(info, inVals2);
+    ConstTensor constOutputTensor2(info, outVals2);
+
+    ConstTensor defaultTensor(info, defaultVals);
+
+    ScopedCpuTensorHandle defaultInput(defaultTensor);
+    ScopedCpuTensorHandle defaultOutput(defaultTensor);
+    std::unique_ptr<Workload1> workload = CreateWorkload<Workload1>(info, &defaultInput, &defaultOutput);
+
+    ScopedCpuTensorHandle asyncInput1(constInputTensor1);
+    ScopedCpuTensorHandle asyncOutput1(constOutputTensor1);
+
+    WorkingMemDescriptor workingMemDescriptor1;
+    workingMemDescriptor1.m_Inputs = std::vector<ITensorHandle*>{&asyncInput1};
+    workingMemDescriptor1.m_Outputs = std::vector<ITensorHandle*>{&asyncOutput1};
+
+    ScopedCpuTensorHandle asyncInput2(constInputTensor2);
+    ScopedCpuTensorHandle asyncOutput2(constOutputTensor2);
+
+    WorkingMemDescriptor workingMemDescriptor2;
+    workingMemDescriptor2.m_Inputs = std::vector<ITensorHandle*>{&asyncInput2};
+    workingMemDescriptor2.m_Outputs = std::vector<ITensorHandle*>{&asyncOutput2};
+
+    std::thread thread1 = std::thread([&]()
+    {
+        workload->ExecuteAsync(workingMemDescriptor1);
+        workload->ExecuteAsync(workingMemDescriptor1);
+    });
+
+    std::thread thread2 = std::thread([&]()
+    {
+        workload->ExecuteAsync(workingMemDescriptor2);
+        workload->ExecuteAsync(workingMemDescriptor2);
+    });
+
+    thread1.join();
+    thread2.join();
+
+    ValidateTensor(workingMemDescriptor1.m_Outputs[0], expectedExecuteVal1);
+    ValidateTensor(workingMemDescriptor1.m_Inputs[0], expectedExecuteVal1);
+
+    ValidateTensor(workingMemDescriptor2.m_Outputs[0], expectedExecuteVal2);
+    ValidateTensor(workingMemDescriptor2.m_Inputs[0], expectedExecuteVal2);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+}
\ No newline at end of file
-- 
cgit v1.2.1
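
Note (not part of the patch): a minimal sketch of the calling pattern this change enables. Any BaseWorkload subclass that does not override ExecuteAsync now inherits the default implementation above, which takes m_AsyncWorkloadMutex, points m_Data at the caller's WorkingMemDescriptor tensors, and funnels into Execute(). The RunTwoClientsConcurrently helper and its parameter names below are illustrative assumptions, not ArmNN API.

#include <backendsCommon/Workload.hpp>

#include <thread>

// Hypothetical caller: each client thread supplies its own WorkingMemDescriptor
// (its own input/output tensor handles); the default ExecuteAsync serialises
// the two clients on the workload's internal mutex, so their m_Data swaps
// cannot interleave.
template <typename WorkloadType>
void RunTwoClientsConcurrently(WorkloadType& workload,
                               armnn::WorkingMemDescriptor& clientMemA,
                               armnn::WorkingMemDescriptor& clientMemB)
{
    std::thread threadA([&]() { workload.ExecuteAsync(clientMemA); });
    std::thread threadB([&]() { workload.ExecuteAsync(clientMemB); });
    threadA.join();
    threadB.join();
}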