diff options
author | Aron Virginas-Tar <Aron.Virginas-Tar@arm.com> | 2018-10-12 13:00:55 +0100 |
---|---|---|
committer | Matthew Bentham <matthew.bentham@arm.com> | 2018-10-22 16:57:53 +0100 |
commit | 3b278e9261bd0de67c82f7d6c36731f118124f52 (patch) | |
tree | 3750ee01827809141752302e94d4d25a21f88492 /src/backends/cl | |
parent | d3360cd490eafc76ceddb6760054bd80444179c1 (diff) | |
download | armnn-3b278e9261bd0de67c82f7d6c36731f118124f52.tar.gz |
IVGCVSW-1938: Move backend-specific source files to the corresponding backend
Change-Id: I558a9a007604afc55e536d877f8da7d0215cc9c3
Diffstat (limited to 'src/backends/cl')
-rw-r--r-- | src/backends/cl/CMakeLists.txt | 5 | ||||
-rw-r--r-- | src/backends/cl/OpenClTimer.cpp | 105 | ||||
-rw-r--r-- | src/backends/cl/OpenClTimer.hpp | 59 | ||||
-rw-r--r-- | src/backends/cl/backend.mk | 1 | ||||
-rw-r--r-- | src/backends/cl/test/CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/backends/cl/test/ClCreateWorkloadTests.cpp | 6 | ||||
-rw-r--r-- | src/backends/cl/test/ClMemCopyTests.cpp | 39 | ||||
-rw-r--r-- | src/backends/cl/test/Fp16SupportTest.cpp | 112 | ||||
-rw-r--r-- | src/backends/cl/test/OpenClTimerTest.cpp | 143 | ||||
-rw-r--r-- | src/backends/cl/workloads/ClWorkloadUtils.hpp | 4 |
10 files changed, 472 insertions, 4 deletions
diff --git a/src/backends/cl/CMakeLists.txt b/src/backends/cl/CMakeLists.txt index 04da6ddcff..2f32081dfe 100644 --- a/src/backends/cl/CMakeLists.txt +++ b/src/backends/cl/CMakeLists.txt @@ -15,6 +15,11 @@ list(APPEND armnnClBackend_sources ) if(ARMCOMPUTECL) + list(APPEND armnnClBackend_sources + OpenClTimer.cpp + OpenClTimer.hpp + ) + add_subdirectory(workloads) add_subdirectory(test) endif() diff --git a/src/backends/cl/OpenClTimer.cpp b/src/backends/cl/OpenClTimer.cpp new file mode 100644 index 0000000000..57552d7bd9 --- /dev/null +++ b/src/backends/cl/OpenClTimer.cpp @@ -0,0 +1,105 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "OpenClTimer.hpp" + +#include <string> +#include <sstream> + +namespace armnn +{ + +OpenClTimer::OpenClTimer() +{ +} + +void OpenClTimer::Start() +{ + m_Kernels.clear(); + + auto interceptor = [this]( cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *gwo, + const size_t *gws, + const size_t *lws, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) + { + cl_int retVal = 0; + + // Get the name of the kernel + cl::Kernel retainedKernel(kernel, true); + std::stringstream ss; + ss << retainedKernel.getInfo<CL_KERNEL_FUNCTION_NAME>(); + + // Embed workgroup sizes into the name + if(gws != nullptr) + { + ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]"; + } + if(lws != nullptr) + { + ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]"; + } + + cl_event customEvent; + + // Forward to original OpenCl function + retVal = m_OriginalEnqueueFunction( command_queue, + kernel, + work_dim, + gwo, + gws, + lws, + num_events_in_wait_list, + event_wait_list, + &customEvent); + + // Store the Kernel info for later GetMeasurements() call + m_Kernels.emplace_back(ss.str(), customEvent); + + return retVal; + }; + + m_OriginalEnqueueFunction = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; +} + +void OpenClTimer::Stop() +{ + CLSymbols::get().clEnqueueNDRangeKernel_ptr = m_OriginalEnqueueFunction; +} + +std::vector<Measurement> OpenClTimer::GetMeasurements() const +{ + std::vector<Measurement> measurements; + + cl_command_queue_properties clQueueProperties = CLScheduler::get().queue().getInfo<CL_QUEUE_PROPERTIES>(); + + int idx = 0; + for (auto& kernel : m_Kernels) + { + std::string name = std::string(this->GetName()) + "/" + std::to_string(idx++) + ": " + kernel.m_Name; + + double timeUs = 0.0; + if((clQueueProperties & CL_QUEUE_PROFILING_ENABLE) != 0) + { + // Wait for the event to finish before accessing profile results. + kernel.m_Event.wait(); + + cl_ulong start = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); + cl_ulong end = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); + timeUs = static_cast<double>(end - start) / 1000.0; + } + + measurements.emplace_back(name, timeUs, Measurement::Unit::TIME_US); + } + + return measurements; +} + +} //namespace armnn diff --git a/src/backends/cl/OpenClTimer.hpp b/src/backends/cl/OpenClTimer.hpp new file mode 100644 index 0000000000..a7ae1387d9 --- /dev/null +++ b/src/backends/cl/OpenClTimer.hpp @@ -0,0 +1,59 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "Instrument.hpp" + +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <arm_compute/core/CL/OpenCL.h> + +#include <vector> +#include <list> + +namespace armnn +{ + +/// OpenClTimer instrument that times all OpenCl kernels executed between calls to Start() and Stop(). +class OpenClTimer : public Instrument +{ +public: + OpenClTimer(); + ~OpenClTimer() = default; + + /// Start the OpenCl timer + void Start() override; + + /// Stop the OpenCl timer + void Stop() override; + + /// Get the name of the timer + /// \return Name of the timer + const char* GetName() const override { return "OpenClKernelTimer"; } + + /// Get the recorded measurements. This will be a list of the execution durations for all the OpenCl kernels. + /// \return Recorded measurements + std::vector<Measurement> GetMeasurements() const override; + +private: + using CLScheduler = arm_compute::CLScheduler; + using CLSymbols = arm_compute::CLSymbols; + using ClEvent = cl::Event; + using ClEnqueueFunc = decltype(CLSymbols::clEnqueueNDRangeKernel_ptr); + + /// Stores info about the OpenCl kernel + struct KernelInfo + { + KernelInfo(const std::string& name, cl_event& event) : m_Name(name), m_Event(event) {} + + std::string m_Name; + ClEvent m_Event; + }; + + std::list<KernelInfo> m_Kernels; ///< List of all kernels executed + ClEnqueueFunc m_OriginalEnqueueFunction; ///< Keep track of original OpenCl function +}; + +} //namespace armnn
\ No newline at end of file diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk index 4375d9496c..205f7b5415 100644 --- a/src/backends/cl/backend.mk +++ b/src/backends/cl/backend.mk @@ -12,6 +12,7 @@ BACKEND_SOURCES := \ ClContextControl.cpp \ ClLayerSupport.cpp \ ClWorkloadFactory.cpp \ + OpenClTimer.cpp \ workloads/ClActivationWorkload.cpp \ workloads/ClAdditionWorkload.cpp \ workloads/ClBatchNormalizationFloatWorkload.cpp \ diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt index d365290a6c..4936a78645 100644 --- a/src/backends/cl/test/CMakeLists.txt +++ b/src/backends/cl/test/CMakeLists.txt @@ -8,6 +8,8 @@ list(APPEND armnnClBackendUnitTests_sources ClCreateWorkloadTests.cpp ClLayerSupportTests.cpp ClLayerTests.cpp + ClMemCopyTests.cpp + OpenClTimerTest.cpp ) add_library(armnnClBackendUnitTests OBJECT ${armnnClBackendUnitTests_sources}) diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp index 66c2c2aa40..526dc68fc5 100644 --- a/src/backends/cl/test/ClCreateWorkloadTests.cpp +++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp @@ -6,13 +6,15 @@ #include "ClContextControlFixture.hpp" #include <backends/MemCopyWorkload.hpp> + +#include <backends/aclCommon/test/CreateWorkloadClNeon.hpp> + #include <backends/cl/ClTensorHandle.hpp> #include <backends/cl/ClWorkloadFactory.hpp> #include <backends/cl/workloads/ClWorkloads.hpp> #include <backends/cl/workloads/ClWorkloadUtils.hpp> -#include <backends/reference/RefWorkloadFactory.hpp> -#include <test/CreateWorkloadClNeon.hpp> +#include <backends/reference/RefWorkloadFactory.hpp> boost::test_tools::predicate_result CompareIClTensorHandleShape(IClTensorHandle* tensorHandle, std::initializer_list<unsigned int> expectedDimensions) diff --git a/src/backends/cl/test/ClMemCopyTests.cpp b/src/backends/cl/test/ClMemCopyTests.cpp new file mode 100644 index 0000000000..af8a36d6c0 --- /dev/null +++ b/src/backends/cl/test/ClMemCopyTests.cpp @@ -0,0 +1,39 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include <backends/cl/ClWorkloadFactory.hpp> +#include <backends/reference/RefWorkloadFactory.hpp> + +#include <backends/test/MemCopyTestImpl.hpp> + +#include <boost/test/unit_test.hpp> + +BOOST_AUTO_TEST_SUITE(ClMemCopy) + +BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu) +{ + LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(false); + BOOST_TEST(CompareTensors(result.output, result.outputExpected)); +} + +BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpu) +{ + LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(false); + BOOST_TEST(CompareTensors(result.output, result.outputExpected)); +} + +BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpuWithSubtensors) +{ + LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(true); + BOOST_TEST(CompareTensors(result.output, result.outputExpected)); +} + +BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpuWithSubtensors) +{ + LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(true); + BOOST_TEST(CompareTensors(result.output, result.outputExpected)); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/backends/cl/test/Fp16SupportTest.cpp b/src/backends/cl/test/Fp16SupportTest.cpp new file mode 100644 index 0000000000..90bef3647b --- /dev/null +++ b/src/backends/cl/test/Fp16SupportTest.cpp @@ -0,0 +1,112 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include <armnn/ArmNN.hpp> +#include <armnn/Descriptors.hpp> +#include <armnn/IRuntime.hpp> +#include <armnn/INetwork.hpp> +#include <armnnUtils/Half.hpp> + +#include <Graph.hpp> +#include <Optimizer.hpp> +#include <backends/CpuTensorHandle.hpp> +#include <backends/test/QuantizeHelper.hpp> + +#include <boost/core/ignore_unused.hpp> +#include <boost/test/unit_test.hpp> + +#include <set> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Fp16Support) + +BOOST_AUTO_TEST_CASE(Fp16DataTypeSupport) +{ + Graph graph; + + Layer* const inputLayer1 = graph.AddLayer<InputLayer>(1, "input1"); + Layer* const inputLayer2 = graph.AddLayer<InputLayer>(2, "input2"); + + Layer* const additionLayer = graph.AddLayer<AdditionLayer>("addition"); + Layer* const outputLayer = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + TensorInfo fp16TensorInfo({1, 2, 3, 5}, armnn::DataType::Float16); + inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + inputLayer1->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + inputLayer2->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + additionLayer->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + + BOOST_CHECK(inputLayer1->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + BOOST_CHECK(inputLayer2->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + BOOST_CHECK(additionLayer->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); +} + +BOOST_AUTO_TEST_CASE(Fp16AdditionTest) +{ + using namespace half_float::literal; + // Create runtime in which test will run + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Builds up the structure of the network. + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* inputLayer1 = net->AddInputLayer(0); + IConnectableLayer* inputLayer2 = net->AddInputLayer(1); + IConnectableLayer* additionLayer = net->AddAdditionLayer(); + IConnectableLayer* outputLayer = net->AddOutputLayer(0); + + inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + //change to float16 + TensorInfo fp16TensorInfo(TensorShape({4}), DataType::Float16); + inputLayer1->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + inputLayer2->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + additionLayer->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + + // optimize the network + std::vector<Compute> backends = {Compute::GpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Loads it into the runtime. + NetworkId netId; + runtime->LoadNetwork(netId, std::move(optNet)); + + std::vector<Half> input1Data + { + 1.0_h, 2.0_h, 3.0_h, 4.0_h + }; + + std::vector<Half> input2Data + { + 100.0_h, 200.0_h, 300.0_h, 400.0_h + }; + + InputTensors inputTensors + { + {0,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input1Data.data())}, + {1,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input2Data.data())} + }; + + std::vector<Half> outputData(input1Data.size()); + OutputTensors outputTensors + { + {0,Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} + }; + + // Does the inference. + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + + // Checks the results. + BOOST_TEST(outputData == std::vector<Half>({ 101.0_h, 202.0_h, 303.0_h, 404.0_h})); // Add +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/backends/cl/test/OpenClTimerTest.cpp b/src/backends/cl/test/OpenClTimerTest.cpp new file mode 100644 index 0000000000..70ceac2a3f --- /dev/null +++ b/src/backends/cl/test/OpenClTimerTest.cpp @@ -0,0 +1,143 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#if (defined(__aarch64__)) || (defined(__x86_64__)) // disable test failing on FireFly/Armv7 + +#include <armnn/test/TensorHelpers.hpp> + +#include <backends/CpuTensorHandle.hpp> +#include <backends/WorkloadFactory.hpp> + +#include <backends/cl/ClContextControl.hpp> +#include <backends/cl/ClWorkloadFactory.hpp> +#include <backends/cl/OpenClTimer.hpp> + +#include <backends/test/TensorCopyUtils.hpp> +#include <backends/test/WorkloadTestUtils.hpp> + +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include <boost/format.hpp> +#include <boost/test/unit_test.hpp> + +#include <iostream> + +using namespace armnn; + +struct OpenClFixture +{ + // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case. + // NOTE: Profiling needs to be enabled in ClContextControl to be able to obtain execution + // times from OpenClTimer. + OpenClFixture() : m_ClContextControl(nullptr, true) {} + ~OpenClFixture() {} + + ClContextControl m_ClContextControl; +}; + +BOOST_FIXTURE_TEST_SUITE(OpenClTimerBatchNorm, OpenClFixture) +using FactoryType = ClWorkloadFactory; + +BOOST_AUTO_TEST_CASE(OpenClTimerBatchNorm) +{ + ClWorkloadFactory workloadFactory; + + const unsigned int width = 2; + const unsigned int height = 3; + const unsigned int channels = 2; + const unsigned int num = 1; + int32_t qOffset = 0; + float qScale = 0.f; + + TensorInfo inputTensorInfo({num, channels, height, width}, GetDataType<float>()); + TensorInfo outputTensorInfo({num, channels, height, width}, GetDataType<float>()); + TensorInfo tensorInfo({channels}, GetDataType<float>()); + + // Set quantization parameters if the requested type is a quantized type. + if(IsQuantizedType<float>()) + { + inputTensorInfo.SetQuantizationScale(qScale); + inputTensorInfo.SetQuantizationOffset(qOffset); + outputTensorInfo.SetQuantizationScale(qScale); + outputTensorInfo.SetQuantizationOffset(qOffset); + tensorInfo.SetQuantizationScale(qScale); + tensorInfo.SetQuantizationOffset(qOffset); + } + + auto input = MakeTensor<float, 4>(inputTensorInfo, + QuantizedVector<float>(qScale, qOffset, + { + 1.f, 4.f, + 4.f, 2.f, + 1.f, 6.f, + + 1.f, 1.f, + 4.f, 1.f, + -2.f, 4.f + })); + // these values are per-channel of the input + auto mean = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, -2})); + auto variance = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {4, 9})); + auto beta = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, 2})); + auto gamma = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {2, 1})); + + std::unique_ptr<ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + BatchNormalizationQueueDescriptor data; + WorkloadInfo info; + ScopedCpuTensorHandle meanTensor(tensorInfo); + ScopedCpuTensorHandle varianceTensor(tensorInfo); + ScopedCpuTensorHandle betaTensor(tensorInfo); + ScopedCpuTensorHandle gammaTensor(tensorInfo); + + AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]); + AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]); + AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]); + AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]); + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + data.m_Mean = &meanTensor; + data.m_Variance = &varianceTensor; + data.m_Beta = &betaTensor; + data.m_Gamma = &gammaTensor; + data.m_Parameters.m_Eps = 0.0f; + + // for each channel: + // substract mean, divide by standard deviation (with an epsilon to avoid div by 0) + // multiply by gamma and add beta + std::unique_ptr<IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + OpenClTimer openClTimer; + + BOOST_CHECK_EQUAL(openClTimer.GetName(), "OpenClKernelTimer"); + + //Start the timer + openClTimer.Start(); + + //Execute the workload + workload->Execute(); + + //Stop the timer + openClTimer.Stop(); + + BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().size(), 1); + + BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().front().m_Name, + "OpenClKernelTimer/0: batchnormalization_layer_nchw GWS[1,3,2]"); + + BOOST_CHECK(openClTimer.GetMeasurements().front().m_Value > 0); + +} + +BOOST_AUTO_TEST_SUITE_END() + +#endif //aarch64 or x86_64
\ No newline at end of file diff --git a/src/backends/cl/workloads/ClWorkloadUtils.hpp b/src/backends/cl/workloads/ClWorkloadUtils.hpp index af4ccd0bb8..c765c63dce 100644 --- a/src/backends/cl/workloads/ClWorkloadUtils.hpp +++ b/src/backends/cl/workloads/ClWorkloadUtils.hpp @@ -5,11 +5,11 @@ #pragma once #include <armnnUtils/Half.hpp> + #include <backends/aclCommon/ArmComputeTensorUtils.hpp> +#include <backends/cl/OpenClTimer.hpp> #include <backends/CpuTensorHandle.hpp> -#include "OpenClTimer.hpp" - #define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \ ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \ name, \ |