author      telsoa01 <telmo.soares@arm.com>    2018-08-31 09:22:23 +0100
committer   telsoa01 <telmo.soares@arm.com>    2018-08-31 09:22:23 +0100
commit      c577f2c6a3b4ddb6ba87a882723c53a248afbeba (patch)
tree        bd7d4c148df27f8be6649d313efb24f536b7cf34 /src/armnn/backends/ClWorkloads
parent      4c7098bfeab1ffe1cdc77f6c15548d3e73274746 (diff)
download    armnn-c577f2c6a3b4ddb6ba87a882723c53a248afbeba.tar.gz
Release 18.08
Diffstat (limited to 'src/armnn/backends/ClWorkloads')
69 files changed, 1411 insertions, 348 deletions
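The same refactor repeats across the files below: per-data-type Float32 workloads become FloatWorkload specialisations (FP16 and FP32) of a variadic TypedWorkload; free-standing ...WorkloadValidate() functions wrap the Compute Library validate() entry points; staging CLTensors for weights and biases move behind std::unique_ptr and are freed once the layer's prepare() call has consumed them; and profiling switches to the ARMNN_SCOPED_PROFILING_EVENT_CL macro. A minimal stand-alone sketch of that shape follows (illustrative only, with hypothetical names, not the Arm NN sources):

#include <memory>
#include <utility>

namespace sketch // hypothetical namespace, not part of Arm NN
{
enum class DataType { Float16, Float32, QuantisedAsymm8 };

// A workload parameterised on every DataType it supports, mirroring the
// TypedWorkload<QueueDescriptor, DataTypes...> refactor in this patch.
template <typename QueueDescriptor, DataType... SupportedTypes>
class TypedWorkload
{
public:
    explicit TypedWorkload(QueueDescriptor descriptor) : m_Data(std::move(descriptor)) {}
    virtual ~TypedWorkload() = default;
    virtual void Execute() const = 0;
protected:
    QueueDescriptor m_Data;
};

// FloatWorkload now spans FP16 and FP32, replacing the old Float32Workload.
template <typename QueueDescriptor>
using FloatWorkload = TypedWorkload<QueueDescriptor, DataType::Float16, DataType::Float32>;

// Weight/bias staging tensors are owned through unique_ptr so that, after a
// layer's prepare() has copied and reshaped them, FreeUnusedTensors() can
// release the GPU-side copies before the first Execute().
template <typename Tensor>
void FreeTensorIfUnused(std::unique_ptr<Tensor>& tensor)
{
    tensor.reset();
}
} // namespace sketch

Under that pattern, each concrete workload constructor in the diff ends with a prepare() call on its layer followed by FreeUnusedTensors().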
diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp
index fb5d78425e..f072549cbc 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp
@@ -9,10 +9,31 @@
 namespace armnn
 {
+arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input,
+                                                 const TensorInfo& output,
+                                                 const ActivationDescriptor& descriptor)
+{
+    const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+    const arm_compute::ActivationLayerInfo activationLayerInfo =
+        ConvertActivationDescriptorToAclActivationLayerInfo(descriptor);
+
+    if (input.GetDataType() == DataType::QuantisedAsymm8 &&
+        activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC)
+    {
+        return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "CL: Logistic Activations unsupported with QAsymm8 data type."};
+    }
+
+    return arm_compute::CLActivationLayer::validate(&aclInput,
+                                                    &aclOutput,
+                                                    activationLayerInfo);
+}
+
 ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor,
                                                          const WorkloadInfo& info)
-    : Float32Workload<ActivationQueueDescriptor>(descriptor, info)
+    : FloatWorkload<ActivationQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClActivationFloat32Workload", 1, 1);
 
@@ -26,7 +47,7 @@ ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDe
 
 void ClActivationFloat32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationFloat32Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationFloat32Workload_Execute");
 
     m_ActivationLayer.run();
 }
diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp
index 9bab4202be..9fbfe95856 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp
@@ -9,9 +9,12 @@
 namespace armnn
 {
+arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input,
+                                                 const TensorInfo& output,
+                                                 const ActivationDescriptor& descriptor);
 
-// Activation layer execution
-class ClActivationFloat32Workload : public Float32Workload<ActivationQueueDescriptor>
+// Activation layer execution.
+class ClActivationFloat32Workload : public FloatWorkload<ActivationQueueDescriptor>
 {
 public:
     ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp
index 3671dd7187..75ab3d0691 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp
@@ -6,6 +6,7 @@
 #include "ClActivationUint8Workload.hpp"
 #include "backends/ClLayerSupport.hpp"
+#include "backends/ArmComputeUtils.hpp"
 #include "backends/ClTensorHandle.hpp"
 #include "backends/CpuTensorHandle.hpp"
 
 namespace armnn
@@ -15,15 +16,8 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri
                                                      const WorkloadInfo& info)
     : Uint8Workload<ActivationQueueDescriptor>(descriptor, info)
 {
-
-    std::string reasonIfUnsupported;
-    if (!IsClActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters))
-    {
-        throw InvalidArgumentException(reasonIfUnsupported);
-    }
-
-    // Only BoundedReLu is supported (see IsClActivationUint8Supported)
-    arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+    auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function);
+    arm_compute::ActivationLayerInfo layerInfo(activation,
                                                m_Data.m_Parameters.m_A,
                                                m_Data.m_Parameters.m_B);
 
@@ -37,7 +31,7 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri
 
 void ClActivationUint8Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationUint8Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationUint8Workload_Execute");
 
     m_ActivationLayer.run();
 }
diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp
index 3a9cceb298..449b2d56c5 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp
@@ -10,7 +10,7 @@
 namespace armnn
 {
 
-// Activation layer execution
+// Activation layer execution.
 class ClActivationUint8Workload : public Uint8Workload<ActivationQueueDescriptor>
 {
 public:
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp
new file mode 100644
index 0000000000..5dd7bb323a
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp
@@ -0,0 +1,71 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClAdditionBaseWorkload.hpp"
+
+#include "backends/ClTensorHandle.hpp"
+#include "backends/CpuTensorHandle.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+template <armnn::DataType... T>
+ClAdditionBaseWorkload<T...>::ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor,
+                                                     const WorkloadInfo& info)
+    : TypedWorkload<AdditionQueueDescriptor, T...>(descriptor, info)
+{
+    this->m_Data.ValidateInputsOutputs("ClAdditionBaseWorkload", 2, 1);
+
+    arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[1])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+    m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy);
+}
+
+template <armnn::DataType... T>
+void ClAdditionBaseWorkload<T...>::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionBaseWorkload_Execute");
+    m_Layer.run();
+}
+
+bool ClAdditionValidate(const TensorInfo& input0,
+                        const TensorInfo& input1,
+                        const TensorInfo& output,
+                        std::string* reasonIfUnsupported)
+{
+    if (input0.GetDataType() == DataType::QuantisedAsymm8)
+    {
+        // Reject quantised addition for the moment (COMPMID-1385)
+        *reasonIfUnsupported = "Quantised Addition not yet supported";
+        return false;
+    }
+
+    const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0);
+    const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+    const arm_compute::Status aclStatus = arm_compute::CLArithmeticAddition::validate(&aclInput0Info,
+                                                                                      &aclInput1Info,
+                                                                                      &aclOutputInfo,
+                                                                                      g_AclConvertPolicy);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported && reasonIfUnsupported)
+    {
+        *reasonIfUnsupported = aclStatus.error_description();
+    }
+
+    return supported;
+}
+
+} //namespace armnn
+
+template class armnn::ClAdditionBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class armnn::ClAdditionBaseWorkload<armnn::DataType::QuantisedAsymm8>;
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp
new file mode 100644
index 0000000000..fba8a0d457
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+template <armnn::DataType... dataTypes>
+class ClAdditionBaseWorkload : public TypedWorkload<AdditionQueueDescriptor, dataTypes...>
+{
+public:
+    ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+    void Execute() const override;
+
+private:
+    mutable arm_compute::CLArithmeticAddition m_Layer;
+};
+
+bool ClAdditionValidate(const TensorInfo& input0,
+                        const TensorInfo& input1,
+                        const TensorInfo& output,
+                        std::string* reasonIfUnsupported);
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp
index 153167f172..b69593f5f5 100644
--- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp
@@ -13,45 +13,10 @@
 namespace armnn
 {
 using namespace armcomputetensorutils;
 
-ClAdditionFloat32Workload::ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor,
-                                                     const WorkloadInfo& info)
-    : Float32Workload<AdditionQueueDescriptor>(descriptor, info)
-{
-    m_Data.ValidateInputsOutputs("ClAdditionFloat32Workload", 2, 1);
-
-    arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
-    arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
-    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-    m_Layer.configure(&input0, &input1, &output, ms_AclConvertPolicy);
-}
-
 void ClAdditionFloat32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClAdditionFloat32Workload_Execute");
-    m_Layer.run();
-}
-
-bool ClAdditionFloat32Workload::IsSupported(const TensorInfo& input0,
-                                            const TensorInfo& input1,
-                                            const TensorInfo& output,
-                                            std::string* reasonIfUnsupported)
-{
-    const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0);
-    const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1);
-    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
-
-    const arm_compute::Status aclStatus = decltype(m_Layer)::validate(&aclInput0Info,
-                                                                      &aclInput1Info,
-                                                                      &aclOutputInfo,
-                                                                      ms_AclConvertPolicy);
-
-    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
-    if (!supported && reasonIfUnsupported)
-    {
-        *reasonIfUnsupported = aclStatus.error_description();
-    }
-
-    return supported;
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionFloat32Workload_Execute");
+    ClAdditionBaseWorkload::Execute();
 }
 
-} //namespace armnn
\ No newline at end of file
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp
index 37e50c2c86..7eac485cfe 100644
--- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp
@@ -5,26 +5,16 @@
 
 #pragma once
 
-#include "backends/ClWorkloadUtils.hpp"
+#include "ClAdditionBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class ClAdditionFloat32Workload : public Float32Workload<AdditionQueueDescriptor>
+class ClAdditionFloat32Workload : public ClAdditionBaseWorkload<DataType::Float16, DataType::Float32>
 {
 public:
-    ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
-
+    using ClAdditionBaseWorkload<DataType::Float16, DataType::Float32>::ClAdditionBaseWorkload;
     void Execute() const override;
-
-    static bool IsSupported(const TensorInfo& input0,
-                            const TensorInfo& input1,
-                            const TensorInfo& output,
-                            std::string* reasonIfUnsupported);
-
-private:
-    mutable arm_compute::CLArithmeticAddition m_Layer;
-    static constexpr arm_compute::ConvertPolicy ms_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
 };
 
-} //namespace armnn
\ No newline at end of file
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp
new file mode 100644
index 0000000000..a72ceca471
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp
@@ -0,0 +1,18 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClAdditionUint8Workload.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+void ClAdditionUint8Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionUint8Workload_Execute");
+    ClAdditionBaseWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp
new file mode 100644
index 0000000000..73ff287e7e
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "ClAdditionBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class ClAdditionUint8Workload : public ClAdditionBaseWorkload<DataType::QuantisedAsymm8>
+{
+public:
+    using ClAdditionBaseWorkload<DataType::QuantisedAsymm8>::ClAdditionBaseWorkload;
+    void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp
index 4b72d92d72..e0bc365053 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp
@@ -4,17 +4,19 @@
 //
 
 #include "ClBaseConstantWorkload.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
 #include "backends/ClTensorHandle.hpp"
 #include "backends/CpuTensorHandle.hpp"
+#include "Half.hpp"
 
 namespace armnn
 {
-template class ClBaseConstantWorkload<DataType::Float32>;
+template class ClBaseConstantWorkload<DataType::Float16, DataType::Float32>;
 template class ClBaseConstantWorkload<DataType::QuantisedAsymm8>;
 
-template<armnn::DataType dataType>
-void ClBaseConstantWorkload<dataType>::Execute() const
+template<armnn::DataType... dataTypes>
+void ClBaseConstantWorkload<dataTypes...>::Execute() const
 {
     // The intermediate tensor held by the corresponding layer output handler can be initialised with the given data
     // on the first inference, then reused for subsequent inferences.
@@ -26,15 +28,21 @@ void ClBaseConstantWorkload<dataType>::Execute() const
         BOOST_ASSERT(data.m_LayerOutput != nullptr);
 
         arm_compute::CLTensor& output = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetTensor();
+        arm_compute::DataType computeDataType = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetDataType();
 
-        switch (dataType)
+        switch (computeDataType)
         {
-            case DataType::Float32:
+            case arm_compute::DataType::F16:
+            {
+                CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
+                break;
+            }
+            case arm_compute::DataType::F32:
             {
                 CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<float>(), output);
                 break;
             }
-            case DataType::QuantisedAsymm8:
+            case arm_compute::DataType::QASYMM8:
             {
                 CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<uint8_t>(), output);
                 break;
diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp
index 660842f375..7ad7bb93ca 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp
@@ -9,12 +9,12 @@
 namespace armnn
 {
-template <armnn::DataType DataType>
-class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataType>
+template <armnn::DataType... DataTypes>
+class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataTypes...>
 {
 public:
     ClBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : TypedWorkload<ConstantQueueDescriptor, DataType>(descriptor, info)
+        : TypedWorkload<ConstantQueueDescriptor, DataTypes...>(descriptor, info)
         , m_RanOnce(false)
     {
     }
diff --git a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp
index 7542c62b47..531e32961b 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp
@@ -10,16 +10,16 @@
 namespace armnn
 {
-// Base class template providing an implementation of the Merger layer common to all data types
-template <armnn::DataType DataType>
-class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataType>
+// Base class template providing an implementation of the Merger layer common to all data types.
+template <armnn::DataType... DataTypes>
+class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...>
 {
 public:
-    using TypedWorkload<MergerQueueDescriptor, DataType>::TypedWorkload;
+    using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload;
 
     void Execute() const override
     {
-        // With subtensors, merger is a no-op
+        // With subtensors, merger is a no-op.
     }
 };
diff --git a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp
index fef841ced2..8e4f10f9fd 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp
@@ -10,16 +10,16 @@
 namespace armnn
 {
-// Base class template providing an implementation of the Splitter layer common to all data types
-template <armnn::DataType DataType>
-class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataType>
+// Base class template providing an implementation of the Splitter layer common to all data types.
+template <armnn::DataType... DataTypes>
+class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...>
 {
 public:
-    using TypedWorkload<SplitterQueueDescriptor, DataType>::TypedWorkload;
+    using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload;
 
     void Execute() const override
     {
-        // With subtensors, merger is a no-op
+        // With subtensors, merger is a no-op.
     }
 };
diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp
index dabd495d59..1849c5d411 100644
--- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp
@@ -7,36 +7,88 @@
 #include "backends/ClTensorHandle.hpp"
 #include "backends/CpuTensorHandle.hpp"
 #include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ClLayerSupport.hpp"
 
 namespace armnn
 {
 using namespace armcomputetensorutils;
 
+arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input,
+                                                 const TensorInfo& output,
+                                                 const TensorInfo& mean,
+                                                 const TensorInfo& var,
+                                                 const TensorInfo& beta,
+                                                 const TensorInfo& gamma,
+                                                 const BatchNormalizationDescriptor &desc)
+{
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+    const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean);
+    const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var);
+    const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta);
+    const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma);
+
+    return arm_compute::CLBatchNormalizationLayer::validate(&aclInputInfo,
+                                                            &aclOutputInfo,
+                                                            &aclMeanInfo,
+                                                            &aclVarInfo,
+                                                            &aclBetaInfo,
+                                                            &aclGammaInfo,
+                                                            desc.m_Eps);
+}
+
 ClBatchNormalizationFloat32Workload::ClBatchNormalizationFloat32Workload(
     const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info)
+    : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
 {
-    BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo());
-    BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo());
-    BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo());
-    BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo());
+    m_Mean = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo());
+
+    m_Variance = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo());
+
+    m_Gamma = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo());
+
+    m_Beta = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
 
     m_Data.ValidateInputsOutputs("ClBatchNormalizationFloat32Workload", 1, 1);
 
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps);
-
-    InitialiseArmComputeClTensorData(m_Mean, m_Data.m_Mean->GetConstTensor<float>());
-    InitialiseArmComputeClTensorData(m_Variance, m_Data.m_Variance->GetConstTensor<float>());
-    InitialiseArmComputeClTensorData(m_Beta, m_Data.m_Beta->GetConstTensor<float>());
-    InitialiseArmComputeClTensorData(m_Gamma, m_Data.m_Gamma->GetConstTensor<float>());
+    m_Layer.configure(&input,
+                      &output,
+                      m_Mean.get(),
+                      m_Variance.get(),
+                      m_Beta.get(),
+                      m_Gamma.get(),
+                      m_Data.m_Parameters.m_Eps);
+
+    InitializeArmComputeClTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean);
+    InitializeArmComputeClTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance);
+    InitializeArmComputeClTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta);
+    InitializeArmComputeClTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma);
+
+    // Force Compute Library to perform the necessary copying and reshaping, after which
+    // delete all the input tensors that will no longer be needed
+    m_Layer.prepare();
+    FreeUnusedTensors();
 }
 
 void ClBatchNormalizationFloat32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClBatchNormalizationFloat32Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClBatchNormalizationFloat32Workload_Execute");
     m_Layer.run();
 }
 
+void ClBatchNormalizationFloat32Workload::FreeUnusedTensors()
+{
+    FreeTensorIfUnused(m_Mean);
+    FreeTensorIfUnused(m_Variance);
+    FreeTensorIfUnused(m_Gamma);
+    FreeTensorIfUnused(m_Beta);
+}
+
 } //namespace armnn
\ No newline at end of file
diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp
index ddbd0f05c0..a45614a284 100644
--- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp
@@ -10,21 +10,31 @@
 namespace armnn
 {
 
-class ClBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor>
+arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input,
+                                                 const TensorInfo& output,
+                                                 const TensorInfo& mean,
+                                                 const TensorInfo& var,
+                                                 const TensorInfo& beta,
+                                                 const TensorInfo& gamma,
+                                                 const BatchNormalizationDescriptor& desc);
+
+class ClBatchNormalizationFloat32Workload : public FloatWorkload<BatchNormalizationQueueDescriptor>
 {
 public:
     ClBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor,
                                         const WorkloadInfo& info);
 
-    using Float32Workload<BatchNormalizationQueueDescriptor>::Float32Workload;
+    using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload;
     void Execute() const override;
 
 private:
     mutable arm_compute::CLBatchNormalizationLayer m_Layer;
 
-    arm_compute::CLTensor m_Mean;
-    arm_compute::CLTensor m_Variance;
-    arm_compute::CLTensor m_Gamma;
-    arm_compute::CLTensor m_Beta;
+    std::unique_ptr<arm_compute::CLTensor> m_Mean;
+    std::unique_ptr<arm_compute::CLTensor> m_Variance;
+    std::unique_ptr<arm_compute::CLTensor> m_Gamma;
+    std::unique_ptr<arm_compute::CLTensor> m_Beta;
+
+    void FreeUnusedTensors();
 };
 
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp
index 99880d68a7..58594999a8 100644
--- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp
@@ -9,7 +9,7 @@ namespace armnn
 
 void ClConstantFloat32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantFloat32Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantFloat32Workload_Execute");
 
     ClBaseConstantWorkload::Execute();
 }
diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp
index 5f86d3b2b6..11c3fda8db 100644
--- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp
@@ -9,10 +9,10 @@
 namespace armnn
 {
 
-class ClConstantFloat32Workload : public ClBaseConstantWorkload<DataType::Float32>
+class ClConstantFloat32Workload : public ClBaseConstantWorkload<DataType::Float16, DataType::Float32>
 {
 public:
-    using ClBaseConstantWorkload<DataType::Float32>::ClBaseConstantWorkload;
+    using ClBaseConstantWorkload<DataType::Float16, DataType::Float32>::ClBaseConstantWorkload;
     void Execute() const override;
 };
diff --git a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp
index 078d4261fa..82ce436557 100644
--- a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp
@@ -9,7 +9,7 @@ namespace armnn
 
 void ClConstantUint8Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantUint8Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantUint8Workload_Execute");
 
     ClBaseConstantWorkload::Execute();
 }
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp
new file mode 100644
index 0000000000..4914be78bc
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp
@@ -0,0 +1,64 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClConvertFp16ToFp32Workload.hpp"
+#include "backends/ClTensorHandle.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+ClConvertFp16ToFp32Workload::ClConvertFp16ToFp32Workload(
+    const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info) :
+    Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
+{
+    this->m_Data.ValidateInputsOutputs("ClConvertFp16ToFp32Workload", 1, 1);
+
+    arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+
+    m_Layer.configure(&input, &output, g_AclConvertPolicy, 0);
+}
+
+void ClConvertFp16ToFp32Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp16ToFp32Workload_Execute");
+    m_Layer.run();
+}
+
+arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input,
+                                                        const TensorInfo& output,
+                                                        std::string* reasonIfUnsupported)
+{
+    if (input.GetDataType() != DataType::Float16)
+    {
+        *reasonIfUnsupported = "Input should be Float16";
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+    }
+    if (output.GetDataType() != DataType::Float32)
+    {
+        *reasonIfUnsupported = "Output should be Float32";
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+    }
+
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+    const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate(
+        &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported && reasonIfUnsupported)
+    {
+        *reasonIfUnsupported = aclStatus.error_description();
+    }
+
+    return aclStatus;
+}
+
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp
new file mode 100644
index 0000000000..36ccbb7144
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+class ClConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
+{
+public:
+
+    ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    mutable arm_compute::CLDepthConvertLayer m_Layer;
+};
+
+arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input,
+                                                        const TensorInfo& output,
+                                                        std::string* reasonIfUnsupported);
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp
new file mode 100644
index 0000000000..19e064351f
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp
@@ -0,0 +1,64 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClConvertFp32ToFp16Workload.hpp"
+#include "backends/ClTensorHandle.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+ClConvertFp32ToFp16Workload::ClConvertFp32ToFp16Workload(
+    const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info) :
+    Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
+{
+    this->m_Data.ValidateInputsOutputs("ClConvertFp32ToFp16Workload", 1, 1);
+
+    arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+
+    m_Layer.configure(&input, &output, g_AclConvertPolicy, 0);
+}
+
+void ClConvertFp32ToFp16Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp32ToFp16Workload_Execute");
+    m_Layer.run();
+}
+
+arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input,
+                                                        const TensorInfo& output,
+                                                        std::string* reasonIfUnsupported)
+{
+    if (input.GetDataType() != DataType::Float32)
+    {
+        *reasonIfUnsupported = "Input should be Float32";
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+    }
+    if (output.GetDataType() != DataType::Float16)
+    {
+        *reasonIfUnsupported = "Output should be Float16";
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+    }
+
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+    const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate(
+        &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported && reasonIfUnsupported)
+    {
+        *reasonIfUnsupported = aclStatus.error_description();
+    }
+
+    return aclStatus;
+}
+
+
+} //namespace armnn
\ No newline at end of file
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp
new file mode 100644
index 0000000000..02a442dabc
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+class ClConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
+{
+public:
+
+    ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    mutable arm_compute::CLDepthConvertLayer m_Layer;
+};
+
+arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input,
+                                                        const TensorInfo& output,
+                                                        std::string* reasonIfUnsupported);
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp
index d7aef3d223..9ac31df5c1 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp
@@ -15,13 +15,15 @@
 using namespace armcomputetensorutils;
 
 ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor,
                                                                const WorkloadInfo& info,
                                                                std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
-    : Float32Workload<Convolution2dQueueDescriptor>(descriptor, info)
+    : FloatWorkload<Convolution2dQueueDescriptor>(descriptor, info)
     , m_ConvolutionLayer(memoryManager)
 {
-    // todo: check tensor shapes match
+    // todo: check tensor shapes match.
     const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
-    BuildArmComputeTensor(m_KernelTensor, weightInfo);
+
+    m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_KernelTensor, weightInfo);
 
     arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
                                              m_Data.m_Parameters.m_StrideY,
@@ -31,11 +33,10 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution
                                              m_Data.m_Parameters.m_PadBottom,
                                              arm_compute::DimensionRoundingType::FLOOR);
 
-    arm_compute::CLTensor* optionalBias = nullptr;
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
-        BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
-        optionalBias = &m_BiasTensor;
+        m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
     }
 
     m_Data.ValidateInputsOutputs("ClConvolution2dFloat32Workload", 1, 1);
@@ -44,24 +45,35 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
     m_ConvolutionLayer.configure(&input,
-                                 &m_KernelTensor,
-                                 optionalBias,
+                                 m_KernelTensor.get(),
+                                 m_BiasTensor.get(),
                                  &output,
                                  padStrideInfo);
 
-    InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<float>());
+    InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight);
 
-    if (optionalBias)
+    if (m_BiasTensor)
     {
-        InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<float>());
+        InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
     }
+
+    // Force Compute Library to perform the necessary copying and reshaping, after which
+    // delete all the input tensors that will no longer be needed
+    m_ConvolutionLayer.prepare();
+    FreeUnusedTensors();
 }
 
 void ClConvolution2dFloat32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dFloat32Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dFloat32Workload_Execute");
 
     m_ConvolutionLayer.run();
 }
 
+void ClConvolution2dFloat32Workload::FreeUnusedTensors()
+{
+    FreeTensorIfUnused(m_KernelTensor);
+    FreeTensorIfUnused(m_BiasTensor);
+}
+
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp
index 4cf73c89cc..51c21aec32 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp
@@ -14,7 +14,7 @@
 namespace armnn
 {
 
-class ClConvolution2dFloat32Workload : public Float32Workload<Convolution2dQueueDescriptor>
+class ClConvolution2dFloat32Workload : public FloatWorkload<Convolution2dQueueDescriptor>
 {
 public:
     ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
@@ -22,10 +22,12 @@ public:
     void Execute() const override;
 
 private:
-    mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
+    mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
 
-    arm_compute::CLTensor m_KernelTensor;
-    arm_compute::CLTensor m_BiasTensor;
+    std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+    std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+    void FreeUnusedTensors();
 };
 
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp
index cf419e752e..a78d7fb4a2 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp
@@ -18,10 +18,11 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu
     : Uint8Workload<Convolution2dQueueDescriptor>(descriptor, info)
     , m_ConvolutionLayer(memoryManager)
 {
-    // todo: check tensor shapes match
     const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
-    BuildArmComputeTensor(m_KernelTensor, weightInfo);
+
+    m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_KernelTensor, weightInfo);
 
     arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
                                              m_Data.m_Parameters.m_StrideY,
@@ -31,11 +32,10 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu
                                              m_Data.m_Parameters.m_PadBottom,
                                              arm_compute::DimensionRoundingType::FLOOR);
 
-    arm_compute::CLTensor* optionalBias = nullptr;
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
-        BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
-        optionalBias = &m_BiasTensor;
+        m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
     }
 
     m_Data.ValidateInputsOutputs("ClConvolution2dUint8Workload", 1, 1);
@@ -44,25 +44,36 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
     m_ConvolutionLayer.configure(&input,
-                                 &m_KernelTensor,
-                                 optionalBias,
+                                 m_KernelTensor.get(),
+                                 m_BiasTensor.get(),
                                  &output,
                                  padStrideInfo);
 
-    InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
+    InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
 
-    if (optionalBias)
+    if (m_BiasTensor)
     {
-        InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<int32_t>());
+        InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor<int32_t>());
     }
+
+    // Force Compute Library to perform the necessary copying and reshaping, after which
+    // delete all the input tensors that will no longer be needed
+    m_ConvolutionLayer.prepare();
+    FreeUnusedTensors();
 }
 
 void ClConvolution2dUint8Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dUint8Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dUint8Workload_Execute");
 
     m_ConvolutionLayer.run();
 }
 
+void ClConvolution2dUint8Workload::FreeUnusedTensors()
+{
+    FreeTensorIfUnused(m_KernelTensor);
+    FreeTensorIfUnused(m_BiasTensor);
+}
+
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp
index d4d3908c80..7d9eb76ba1 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp
@@ -22,10 +22,12 @@ public:
     void Execute() const override;
 
 private:
-    mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
+    mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
 
-    arm_compute::CLTensor m_KernelTensor;
-    arm_compute::CLTensor m_BiasTensor;
+    std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+    std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+    void FreeUnusedTensors();
 };
 
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp
new file mode 100644
index 0000000000..cfb8485039
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp
@@ -0,0 +1,122 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
+#include "TypeUtils.hpp"
+
+#include "backends/ArmComputeUtils.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ClTensorHandle.hpp"
+#include "backends/CpuTensorHandle.hpp"
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+                                                           const TensorInfo& output,
+                                                           const DepthwiseConvolution2dDescriptor& descriptor,
+                                                           const TensorInfo& weights,
+                                                           const TensorInfo& biases)
+{
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights);
+
+    arm_compute::TensorInfo aclBiasesInfo;
+    arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+    if (descriptor.m_BiasEnabled)
+    {
+        aclBiasesInfo = BuildArmComputeTensorInfo(biases);
+        optionalAclBiasesInfo = &aclBiasesInfo;
+    }
+
+    const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
+    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+    return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo,
+                                                              &aclWeightsInfo,
+                                                              optionalAclBiasesInfo,
+                                                              &aclOutputInfo,
+                                                              aclPadStrideInfo,
+                                                              aclDepthMultiplier);
+}
+
+template<armnn::DataType... dataTypes>
+ClDepthwiseConvolutionBaseWorkload<dataTypes...>::ClDepthwiseConvolutionBaseWorkload(
+    const DepthwiseConvolution2dQueueDescriptor& descriptor,
+    const WorkloadInfo& info)
+    : TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>(descriptor, info)
+{
+    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
+
+    m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_KernelTensor, weightInfo);
+
+    if (m_Data.m_Parameters.m_BiasEnabled)
+    {
+        m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
+    }
+
+    arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+                                             m_Data.m_Parameters.m_StrideY,
+                                             m_Data.m_Parameters.m_PadLeft,
+                                             m_Data.m_Parameters.m_PadRight,
+                                             m_Data.m_Parameters.m_PadTop,
+                                             m_Data.m_Parameters.m_PadBottom,
+                                             arm_compute::DimensionRoundingType::FLOOR);
+
+    std::string name = std::string("ClDepthwiseConvolution") +
+        GetDataTypeName(m_Data.m_Weight->GetTensorInfo().GetDataType()) + "Workload";
+    m_Data.ValidateInputsOutputs(name, 1, 1);
+
+    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
+
+    //Check for optimisation opportunities.
+    bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3);
+    if (use3x3Optimisation)
+    {
+        m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
+        static_cast<arm_compute::CLDepthwiseConvolutionLayer3x3*>(m_DepthwiseConvolutionLayer.get())->configure(
+            &input,
+            m_KernelTensor.get(),
+            m_BiasTensor.get(),
+            &output,
+            padStrideInfo,
+            depthMultiplier);
+    }
+    else
+    {
+        m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
+        static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_DepthwiseConvolutionLayer.get())->configure(
+            &input,
+            m_KernelTensor.get(),
+            m_BiasTensor.get(),
+            &output,
+            padStrideInfo,
+            depthMultiplier);
+    }
+
+    BOOST_ASSERT(m_DepthwiseConvolutionLayer);
+}
+
+template<armnn::DataType... dataTypes>
+void ClDepthwiseConvolutionBaseWorkload<dataTypes...>::FreeUnusedTensors()
+{
+    FreeTensorIfUnused(m_KernelTensor);
+    FreeTensorIfUnused(m_BiasTensor);
+}
+
+// Generate known implementations for linker
+template class ClDepthwiseConvolutionBaseWorkload<DataType::Float16, DataType::Float32>;
+template class ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp
new file mode 100644
index 0000000000..a879efc89e
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp
@@ -0,0 +1,37 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+                                                           const TensorInfo& output,
+                                                           const DepthwiseConvolution2dDescriptor& descriptor,
+                                                           const TensorInfo& weights,
+                                                           const TensorInfo& biases);
+
+template<armnn::DataType... dataTypes>
+class ClDepthwiseConvolutionBaseWorkload : public TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>
+{
+public:
+    using TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>::m_Data;
+
+    ClDepthwiseConvolutionBaseWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+                                       const WorkloadInfo& info);
+
+protected:
+    std::unique_ptr<arm_compute::IFunction> m_DepthwiseConvolutionLayer;
+
+    std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+    std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+    void FreeUnusedTensors();
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp
index f31c73bc60..96d97ad4ea 100644
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp
@@ -4,8 +4,8 @@
 //
 
 #include "ClDepthwiseConvolutionFloat32Workload.hpp"
-#include "ClDepthwiseConvolutionHelper.hpp"
-#include "backends/ClTensorHandle.hpp"
+
+#include "backends/ClWorkloadUtils.hpp"
 #include "backends/CpuTensorHandle.hpp"
 
 namespace armnn
@@ -14,17 +14,25 @@ namespace armnn
 ClDepthwiseConvolutionFloat32Workload::ClDepthwiseConvolutionFloat32Workload(
     const DepthwiseConvolution2dQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+    : ClDepthwiseConvolutionBaseWorkload(descriptor, info)
 {
-    InitClDepthwiseConvolutionWorkload(*this);
+    InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight);
+
+    if (m_BiasTensor)
+    {
+        InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
+    }
+
+    m_DepthwiseConvolutionLayer->prepare();
+    FreeUnusedTensors();
 }
 
 void ClDepthwiseConvolutionFloat32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionFloat32Workload_Execute");
-    BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionFloat32Workload_Execute");
+    BOOST_ASSERT(m_DepthwiseConvolutionLayer);
 
-    m_pDepthwiseConvolutionLayer->run();
+    m_DepthwiseConvolutionLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp
index 8711f0c515..669fd928b5 100644
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp
@@ -5,29 +5,20 @@
 
 #pragma once
 
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
 #include "backends/ClWorkloadUtils.hpp"
 
 namespace armnn
 {
 
-class ClDepthwiseConvolutionFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor>
+class ClDepthwiseConvolutionFloat32Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::Float16,
+                                                                                        DataType::Float32>
 {
 public:
     ClDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
                                           const WorkloadInfo& info);
     void Execute() const override;
-
-private:
-    typedef float KernelDataType;
-    typedef float BiasDataType;
-
-    mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
-
-    arm_compute::CLTensor m_KernelTensor;
-    arm_compute::CLTensor m_BiasTensor;
-
-    template <typename WorkloadType>
-    friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload);
 };
 
 } //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp
deleted file mode 100644
index cd7115773d..0000000000
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// See LICENSE file in the project root for full license information.
-//
-
-#pragma once
-
-#include <armnn/TypesUtils.hpp>
-#include "backends/ClLayerSupport.hpp"
-#include "backends/ArmComputeTensorUtils.hpp"
-#include "backends/ClTensorHandle.hpp"
-
-namespace armnn
-{
-
-template <typename WorkloadType>
-void InitClDepthwiseConvolutionWorkload(WorkloadType& workload)
-{
-    using T = typename WorkloadType::KernelDataType;
-    using B = typename WorkloadType::BiasDataType;
-
-    auto& m_Data = workload.GetData();
-    auto& m_KernelTensor = workload.m_KernelTensor;
-    auto& m_BiasTensor = workload.m_BiasTensor;
-    auto& m_pDepthwiseConvolutionLayer = workload.m_pDepthwiseConvolutionLayer;
-
-    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
-
-    std::string reasonIfUnsupported;
-    if (!IsClDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo))
-    {
-        throw UnimplementedException(reasonIfUnsupported);
-    }
-
-    armcomputetensorutils::BuildArmComputeTensor(m_KernelTensor, weightInfo);
-
-    arm_compute::CLTensor* optionalBias = nullptr;
-    if (m_Data.m_Parameters.m_BiasEnabled)
-    {
-        armcomputetensorutils::BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
-        optionalBias = &m_BiasTensor;
-    }
-
-    arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
-                                             m_Data.m_Parameters.m_StrideY,
-                                             m_Data.m_Parameters.m_PadLeft,
-                                             m_Data.m_Parameters.m_PadRight,
-                                             m_Data.m_Parameters.m_PadTop,
-                                             m_Data.m_Parameters.m_PadBottom,
-                                             arm_compute::DimensionRoundingType::FLOOR);
-
-    std::string name = std::string("ClDepthwiseConvolution") + GetDataTypeName(GetDataType<T>()) + "Workload";
-    m_Data.ValidateInputsOutputs(name, 1, 1);
-
-    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
-    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-
-    //Check for optimisation opportunities.
-    bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3);
-    if (use3x3Optimisation)
-    {
-        m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
-        static_cast<arm_compute::CLDepthwiseConvolutionLayer3x3*>(m_pDepthwiseConvolutionLayer.get())->configure(
-            &input,
-            &m_KernelTensor,
-            optionalBias,
-            &output,
-            padStrideInfo);
-    }
-    else
-    {
-        m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
-        static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_pDepthwiseConvolutionLayer.get())->configure(
-            &input,
-            &m_KernelTensor,
-            optionalBias,
-            &output,
-            padStrideInfo);
-    }
-
-    BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
-
-    InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor<T>());
-
-    if (optionalBias)
-    {
-        InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->template GetConstTensor<B>());
-    }
-}
-
-} //namespace armnn
\ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp index 7e7c488c74..4852ce8bf9 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp @@ -4,28 +4,34 @@ // #include "ClDepthwiseConvolutionUint8Workload.hpp" -#include "ClDepthwiseConvolutionHelper.hpp" -#include "backends/ClTensorHandle.hpp" + #include "backends/CpuTensorHandle.hpp" namespace armnn { - ClDepthwiseConvolutionUint8Workload::ClDepthwiseConvolutionUint8Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Uint8Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info) + : ClDepthwiseConvolutionBaseWorkload(descriptor, info) { - InitClDepthwiseConvolutionWorkload(*this); + InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<uint8_t>()); + + if (m_BiasTensor) + { + InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>()); + } + + m_DepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void ClDepthwiseConvolutionUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionUint8Workload_Execute"); - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionUint8Workload_Execute"); + BOOST_ASSERT(m_DepthwiseConvolutionLayer); - m_pDepthwiseConvolutionLayer->run(); + m_DepthwiseConvolutionLayer->run(); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp index ee09ff3e58..a4277d405f 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp @@ -5,29 +5,19 @@ #pragma once +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + #include "backends/ClWorkloadUtils.hpp" namespace armnn { -class ClDepthwiseConvolutionUint8Workload : public Uint8Workload<DepthwiseConvolution2dQueueDescriptor> +class ClDepthwiseConvolutionUint8Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8> { public: ClDepthwiseConvolutionUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; - -private: - typedef uint8_t KernelDataType; - typedef int32_t BiasDataType; - - mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer; - - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; - - template <typename WorkloadType> - friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp index 882da50855..da71c50305 100644 --- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn { ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<FloorQueueDescriptor>(descriptor, info) + : FloatWorkload<FloorQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClFloorFloat32Workload", 1, 1); @@ -22,7 +22,7 @@ 
ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descr void ClFloorFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFloorFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClFloorFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp index 532dd29884..bd7f3032fc 100644 --- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClFloorFloat32Workload : public Float32Workload<FloorQueueDescriptor> +class ClFloorFloat32Workload : public FloatWorkload<FloorQueueDescriptor> { public: ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp index 5dfab9cbbd..5014dd27ca 100644 --- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp @@ -7,47 +7,89 @@ #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ArmComputeUtils.hpp" +#include "backends/ClLayerSupport.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiases; + arm_compute::TensorInfo *optionalAclBiases = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiases = BuildArmComputeTensorInfo(biases); + optionalAclBiases = &aclBiases; + } + + const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); + + return arm_compute::CLFullyConnectedLayer::validate(&aclInput, + &aclWeights, + optionalAclBiases, + &aclOutput, + fullyConnectedLayerInfo); +} + ClFullyConnectedFloat32Workload::ClFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info) - , m_FullyConnected(memoryManager) + : FloatWorkload<FullyConnectedQueueDescriptor>(descriptor, info) + , m_FullyConnectedLayer(memoryManager) { + m_WeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - - arm_compute::CLTensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasesTensor; + m_BiasesTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClFullyConnectedFloat32Workload", 1, 1); arm_compute::ICLTensor& input = 
static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); + // Construct - m_FullyConnected.configure( - &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix); + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); // Allocate - InitialiseArmComputeClTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor<float>()); + InitializeArmComputeClTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight); - if (optionalBiasTensor) + if (m_BiasesTensor) { - InitialiseArmComputeClTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor<float>()); + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_FullyConnectedLayer.prepare(); + FreeUnusedTensors(); } void ClFullyConnectedFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFullyConnectedFloat32Workload_Execute"); - m_FullyConnected.run(); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClFullyConnectedFloat32Workload_Execute"); + m_FullyConnectedLayer.run(); +} + +void ClFullyConnectedFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_WeightsTensor); + FreeTensorIfUnused(m_BiasesTensor); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp index c8d1227bda..f580e580c6 100644 --- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp @@ -14,20 +14,29 @@ namespace armnn { -class ClFullyConnectedFloat32Workload : public armnn::Float32Workload<armnn::FullyConnectedQueueDescriptor> +arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor); + +class ClFullyConnectedFloat32Workload : public armnn::FloatWorkload<armnn::FullyConnectedQueueDescriptor> { public: ClFullyConnectedFloat32Workload(const armnn::FullyConnectedQueueDescriptor& descriptor, const armnn::WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager); - using armnn::Float32Workload<armnn::FullyConnectedQueueDescriptor>::m_Data; + using armnn::FloatWorkload<armnn::FullyConnectedQueueDescriptor>::m_Data; void Execute() const override; private: - mutable arm_compute::CLFullyConnectedLayer m_FullyConnected; - arm_compute::CLTensor m_WeightsTensor; - arm_compute::CLTensor m_BiasesTensor; + mutable arm_compute::CLFullyConnectedLayer m_FullyConnectedLayer; + + std::unique_ptr<arm_compute::CLTensor> m_WeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_BiasesTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp index e15db74ec9..628e38d3da 100644 --- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp @@ -12,9 +12,21 @@ 
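ClFullyConnectedWorkloadValidate gives the CL layer-support code a way to reject a configuration before any workload is constructed. A usage sketch; the caller shown here is illustrative, not part of the patch:

    // Hypothetical caller-side check; reasonIfUnsupported is an assumed std::string*.
    arm_compute::Status aclStatus = ClFullyConnectedWorkloadValidate(input, output,
                                                                     weights, biases,
                                                                     descriptor);
    if (aclStatus.error_code() != arm_compute::ErrorCode::OK)
    {
        if (reasonIfUnsupported != nullptr)
        {
            *reasonIfUnsupported = aclStatus.error_description();
        }
        return false;
    }
    return true;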
namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + CreateAclNormalizationLayerInfoForL2Normalization(input); + + return arm_compute::CLNormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<L2NormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClL2NormalizationFloat32Workload", 1, 1); @@ -25,7 +37,7 @@ ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2Norma void ClL2NormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClL2NormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClL2NormalizationFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp index 848803e2f0..bf898e31f7 100644 --- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp @@ -10,7 +10,10 @@ namespace armnn { -class ClL2NormalizationFloat32Workload : public Float32Workload<L2NormalizationQueueDescriptor> +arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +class ClL2NormalizationFloat32Workload : public FloatWorkload<L2NormalizationQueueDescriptor> { public: ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp new file mode 100644 index 0000000000..db5c303854 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp @@ -0,0 +1,405 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "ClLstmFloat32Workload.hpp" +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClLayerSupport.hpp" +#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" + +namespace armnn +{ +using namespace armcomputetensorutils; + +ClLstmFloat32Workload::ClLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info) + : FloatWorkload<LstmQueueDescriptor>(descriptor, info) +{ + arm_compute::LSTMParams<arm_compute::ICLTensor> lstm_param; + + // Basic parameters + m_InputToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo()); + + m_InputToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo()); + + m_InputToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo()); + + m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo()); + + m_RecurrentToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo()); + + m_RecurrentToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo()); + + m_ForgetGateBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo()); + + m_CellBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo()); + + m_OutputGateBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo()); + + // for future reference: check the AndroidNN API for the logic here + if (!m_Data.m_Parameters.m_CifgEnabled) + { + m_InputToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo()); + + m_RecurrentToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo()); + + m_CellToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + if (m_Data.m_CellToInputWeights != nullptr) + { + BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo()); + } + + m_InputGateBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo()); + + lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(), + m_RecurrentToInputWeightsTensor.get(), + m_Data.m_CellToInputWeights != nullptr ? 
m_CellToInputWeightsTensor.get() : nullptr, + m_InputGateBiasTensor.get()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + m_ProjectionWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo()); + + m_ProjectionBiasTensor = std::make_unique<arm_compute::CLTensor>(); + if (m_Data.m_ProjectionBias != nullptr) + { + BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo()); + } + + lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(), + m_Data.m_ProjectionBias != nullptr ? m_ProjectionBiasTensor.get() : nullptr); + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + m_CellToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo()); + + m_CellToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo()); + + lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get()); + } + + const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); + const arm_compute::ICLTensor& output_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor(); + const arm_compute::ICLTensor& cell_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetTensor(); + + arm_compute::ICLTensor& output_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[1])->GetTensor(); + arm_compute::ICLTensor& cell_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[2])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[3])->GetTensor(); + + // Get the batch_size and the num_units from the cellStateIn dimensions + const TensorInfo& inputTensorInfo = info.m_InputTensorInfos[2]; + const unsigned int batch_size = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[0]); + const unsigned int num_units = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[1]); + + m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>(); + if (m_Data.m_Parameters.m_CifgEnabled) + { + // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG + armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32); + BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1); + } + else + { + // scratch_buffer [num_units * 3, batch_size] without CIFG + armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32); + BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2); + } + + float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell; + float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj; + + // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations + arm_compute::ActivationLayerInfo activationLayerInfo; + if (m_Data.m_Parameters.m_ActivationFunc == 0) + { + // no activation, do nothing + } + else if (m_Data.m_Parameters.m_ActivationFunc == 1) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::RELU); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 3) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 4) + { + 
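// ActivationFunc 4 selects TANH; Compute Library's TANH computes a * tanh(b * x), + // so passing a = b = 1.0 requests the plain tanh this branch expects. +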
activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 6) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); + } + else + { + throw armnn::Exception("Wrong Type of Activation Function!"); + } + + + m_LstmLayer.configure(&input, m_InputToForgetWeightsTensor.get(), m_InputToCellWeightsTensor.get(), + m_InputToOutputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(), + m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(), + m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(), + &output_state_in, &cell_state_in, m_ScratchBuffer.get(), &output_state_out, + &cell_state_out, &output, lstm_param, activationLayerInfo, + cell_threshold, projection_threshold); + + armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer); + + InitialiseArmComputeClTensorData(*m_InputToForgetWeightsTensor, + m_Data.m_InputToForgetWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_InputToCellWeightsTensor, + m_Data.m_InputToCellWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_InputToOutputWeightsTensor, + m_Data.m_InputToOutputWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToForgetWeightsTensor, + m_Data.m_RecurrentToForgetWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToCellWeightsTensor, + m_Data.m_RecurrentToCellWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToOutputWeightsTensor, + m_Data.m_RecurrentToOutputWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_ForgetGateBiasTensor, + m_Data.m_ForgetGateBias->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_CellBiasTensor, + m_Data.m_CellBias->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_OutputGateBiasTensor, + m_Data.m_OutputGateBias->GetConstTensor<float>()); + + if (!m_Data.m_Parameters.m_CifgEnabled) + { + InitialiseArmComputeClTensorData(*m_InputToInputWeightsTensor, + m_Data.m_InputToInputWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToInputWeightsTensor, + m_Data.m_RecurrentToInputWeights->GetConstTensor<float>()); + if (m_Data.m_CellToInputWeights != nullptr) + { + InitialiseArmComputeClTensorData(*m_CellToInputWeightsTensor, + m_Data.m_CellToInputWeights->GetConstTensor<float>()); + } + InitialiseArmComputeClTensorData(*m_InputGateBiasTensor, + m_Data.m_InputGateBias->GetConstTensor<float>()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + InitialiseArmComputeClTensorData(*m_ProjectionWeightsTensor, + m_Data.m_ProjectionWeights->GetConstTensor<float>()); + if (m_Data.m_ProjectionBias != nullptr) + { + InitialiseArmComputeClTensorData(*m_ProjectionBiasTensor, + m_Data.m_ProjectionBias->GetConstTensor<float>()); + } + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + InitialiseArmComputeClTensorData(*m_CellToForgetWeightsTensor, + m_Data.m_CellToForgetWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_CellToOutputWeightsTensor, + m_Data.m_CellToOutputWeights->GetConstTensor<float>()); + } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_LstmLayer.prepare(); + FreeUnusedTensors(); +} + +void 
ClLstmFloat32Workload::Execute() const +{ + m_LstmLayer.run(); +} + +arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, + const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, + const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, + const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, + const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, + const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, + const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, + const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights) +{ + arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info; + + // The inputs and the outputs + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn); + const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn); + const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer); + const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut); + const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + // Basic parameters + const arm_compute::TensorInfo aclInputToForgetWeightsInfo = BuildArmComputeTensorInfo(inputToForgetWeights); + const arm_compute::TensorInfo aclInputToCellWeightsInfo = BuildArmComputeTensorInfo(inputToCellWeights); + const arm_compute::TensorInfo aclInputToOutputWeightsInfo = BuildArmComputeTensorInfo(inputToOutputWeights); + const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo + = BuildArmComputeTensorInfo(recurrentToForgetWeights); + const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo + = BuildArmComputeTensorInfo(recurrentToCellWeights); + const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo + = BuildArmComputeTensorInfo(recurrentToOutputWeights); + const arm_compute::TensorInfo aclForgetGateBiasInfo = BuildArmComputeTensorInfo(forgetGateBias); + const arm_compute::TensorInfo aclCellBiasInfo = BuildArmComputeTensorInfo(cellBias); + const arm_compute::TensorInfo aclOutputGateBiasInfo = BuildArmComputeTensorInfo(outputGateBias); + + arm_compute::TensorInfo aclInputToInputWeightsInfo; + arm_compute::TensorInfo aclRecurrentToInputWeightsInfo; + arm_compute::TensorInfo aclCellToInputWeightsInfo; + arm_compute::TensorInfo aclInputGateBiasInfo; + arm_compute::TensorInfo aclProjectionWeightsInfo; + arm_compute::TensorInfo aclProjectionBiasInfo; + arm_compute::TensorInfo aclCellToForgetWeightsInfo; + arm_compute::TensorInfo aclCellToOutputWeightsInfo; + + if (!descriptor.m_CifgEnabled) + { + armnn::TensorInfo inputToInputWInfo = *inputToInputWeights; + aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(inputToInputWInfo); + armnn::TensorInfo recurrentToInputWInfo = *recurrentToInputWeights; + aclRecurrentToInputWeightsInfo = 
BuildArmComputeTensorInfo(recurrentToInputWInfo); + + if (cellToInputWeights != nullptr) + { + armnn::TensorInfo cellToInputWInfo = *cellToInputWeights; + aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(cellToInputWInfo); + } + armnn::TensorInfo inputGateBiasInfo = *inputGateBias; + aclInputGateBiasInfo = BuildArmComputeTensorInfo(inputGateBiasInfo); + lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo, &aclRecurrentToInputWeightsInfo, + cellToInputWeights != nullptr ? &aclCellToInputWeightsInfo: nullptr, + &aclInputGateBiasInfo); + } + + if (descriptor.m_ProjectionEnabled) + { + const armnn::TensorInfo& projectionWInfo = *projectionWeights; + aclProjectionWeightsInfo = BuildArmComputeTensorInfo(projectionWInfo); + + if (projectionBias != nullptr) + { + const armnn::TensorInfo& projectionBiasInfo = *projectionBias; + aclProjectionBiasInfo = BuildArmComputeTensorInfo(projectionBiasInfo); + } + lstm_params_info.set_projection_params(&aclProjectionWeightsInfo, + projectionBias != nullptr ? &aclProjectionBiasInfo: nullptr); + } + + if (descriptor.m_PeepholeEnabled) + { + const armnn::TensorInfo& cellToForgetWInfo = *cellToForgetWeights; + aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(cellToForgetWInfo); + const armnn::TensorInfo& cellToOutputWInfo = *cellToOutputWeights; + aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(cellToOutputWInfo); + lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo); + } + + float cell_threshold = descriptor.m_ClippingThresCell; + float projection_threshold = descriptor.m_ClippingThresProj; + + // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations + arm_compute::ActivationLayerInfo activationLayerInfo; + if (descriptor.m_ActivationFunc == 0) + { + // no activation, do nothing + } + else if (descriptor.m_ActivationFunc == 1) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::RELU); + } + else if (descriptor.m_ActivationFunc == 3) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0); + } + else if (descriptor.m_ActivationFunc == 4) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0); + } + else if (descriptor.m_ActivationFunc == 6) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); + } + else + { + throw armnn::Exception("Wrong Type of Activation Function!"); + } + + return arm_compute::CLLSTMLayer::validate(&aclInputInfo, &aclInputToForgetWeightsInfo, + &aclInputToCellWeightsInfo, + &aclInputToOutputWeightsInfo, + &aclRecurrentToForgetWeightsInfo, + &aclRecurrentToCellWeightsInfo, + &aclRecurrentToOutputWeightsInfo, + &aclForgetGateBiasInfo, + &aclCellBiasInfo, + &aclOutputGateBiasInfo, + &aclOutputStateInInfo, &aclCellStateInInfo, + &aclScratchBufferInfo, &aclOutputStateOutInfo, + &aclCellStateOutInfo, &aclOutputInfo, + lstm_params_info, activationLayerInfo, + cell_threshold, projection_threshold); +} + +void ClLstmFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_InputToInputWeightsTensor); + FreeTensorIfUnused(m_InputToForgetWeightsTensor); + FreeTensorIfUnused(m_InputToCellWeightsTensor); + FreeTensorIfUnused(m_InputToOutputWeightsTensor); + FreeTensorIfUnused(m_RecurrentToInputWeightsTensor); + 
FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor); + FreeTensorIfUnused(m_RecurrentToCellWeightsTensor); + FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor); + FreeTensorIfUnused(m_CellToInputWeightsTensor); + FreeTensorIfUnused(m_CellToForgetWeightsTensor); + FreeTensorIfUnused(m_CellToOutputWeightsTensor); + FreeTensorIfUnused(m_InputGateBiasTensor); + FreeTensorIfUnused(m_ForgetGateBiasTensor); + FreeTensorIfUnused(m_CellBiasTensor); + FreeTensorIfUnused(m_OutputGateBiasTensor); + FreeTensorIfUnused(m_ProjectionWeightsTensor); + FreeTensorIfUnused(m_ProjectionBiasTensor); + FreeTensorIfUnused(m_ScratchBuffer); +} + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp new file mode 100644 index 0000000000..e2358ad10d --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp @@ -0,0 +1,67 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class ClLstmFloat32Workload : public FloatWorkload<LstmQueueDescriptor> +{ +public: + ClLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info); + void Execute() const override; + +private: + mutable arm_compute::CLLSTMLayer m_LstmLayer; + + std::unique_ptr<arm_compute::CLTensor> m_InputToInputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputToForgetWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputToCellWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputToOutputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToInputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToForgetWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToCellWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToOutputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellToInputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellToForgetWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellToOutputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputGateBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_ForgetGateBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_OutputGateBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_ProjectionWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_ProjectionBiasTensor; + + std::unique_ptr<arm_compute::CLTensor> m_ScratchBuffer; + + void FreeUnusedTensors(); +}; + +arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor &descriptor, + const TensorInfo& inputToForgetWeights, + const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, + const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, + const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, + const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, + const TensorInfo* cellToInputWeights, + const 
TensorInfo* inputGateBias, + const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, + const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights); +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp index 4d2d708a0e..89e7690a36 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp @@ -11,7 +11,7 @@ namespace armnn void ClMergerFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerFloat32Workload_Execute"); ClBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp index 9808d30ccf..3cafa23c1e 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class ClMergerFloat32Workload : public ClBaseMergerWorkload<armnn::DataType::Float32> +class ClMergerFloat32Workload : public ClBaseMergerWorkload<DataType::Float16, DataType::Float32> { public: - using ClBaseMergerWorkload<armnn::DataType::Float32>::ClBaseMergerWorkload; + using ClBaseMergerWorkload<DataType::Float16, DataType::Float32>::ClBaseMergerWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp index 94a1d3c593..551135b7da 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp @@ -11,7 +11,7 @@ namespace armnn void ClMergerUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerUint8Workload_Execute"); ClBaseMergerWorkload<DataType::QuantisedAsymm8>::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp index 405d109aa1..7aa33146f3 100644 --- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp @@ -10,9 +10,29 @@ namespace armnn { +arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, + // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be + // ignored for F32 tensors. 
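+ // Note: the 1.0f scale below requests a plain element-wise product; SATURATE + // clamps on integer overflow, and TO_ZERO is the truncating rounding policy + // that the comment above refers to.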
+ return arm_compute::CLPixelWiseMultiplication::validate(&aclInput1, + &aclInput2, + &aclOutput, + 1.0f, + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_ZERO); +} + + ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<MultiplicationQueueDescriptor>(descriptor, info) + : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClMultiplicationFloat32Workload", 2, 1); @@ -30,9 +50,9 @@ ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const Multiplic void ClMultiplicationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMultiplicationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMultiplicationFloat32Workload_Execute"); - // Execute the layer + // Executes the layer. m_PixelWiseMultiplication.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp index 8e387118e8..0d6199047d 100644 --- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp @@ -9,12 +9,17 @@ namespace armnn { -class ClMultiplicationFloat32Workload : public Float32Workload<MultiplicationQueueDescriptor> + +arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); + +class ClMultiplicationFloat32Workload : public FloatWorkload<MultiplicationQueueDescriptor> { public: ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info); - using Float32Workload<MultiplicationQueueDescriptor>::Float32Workload; + using FloatWorkload<MultiplicationQueueDescriptor>::FloatWorkload; void Execute() const override; private: diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp index a163ec2883..d23d6e11bd 100644 --- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp @@ -27,7 +27,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, con ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<NormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClNormalizationFloat32Workload", 1, 1); @@ -42,7 +42,7 @@ ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const Normalizati void ClNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClNormalizationFloat32Workload_Execute"); m_NormalizationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp index cbd5fa92a9..e8ab0b9a18 100644 --- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp @@ -14,7 +14,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const NormalizationDescriptor& 
descriptor); -class ClNormalizationFloat32Workload : public Float32Workload<NormalizationQueueDescriptor> +class ClNormalizationFloat32Workload : public FloatWorkload<NormalizationQueueDescriptor> { public: ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp index 3147e95b2e..3c132cb8f8 100644 --- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp @@ -24,10 +24,10 @@ arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descripto return arm_compute::Status{}; } -template <armnn::DataType DataType> -ClPermuteWorkload<DataType>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, +template <armnn::DataType... DataTypes> +ClPermuteWorkload<DataTypes...>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload<PermuteQueueDescriptor, DataType>(descriptor, info) + : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info) { using armcomputetensorutils::BuildArmComputePermutationVector; @@ -37,18 +37,18 @@ ClPermuteWorkload<DataType>::ClPermuteWorkload(const PermuteQueueDescriptor& des arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings; - // Run the layer + // Run the layer. m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings)); } -template <armnn::DataType DataType> -void ClPermuteWorkload<DataType>::Execute() const +template <armnn::DataType... DataTypes> +void ClPermuteWorkload<DataTypes...>::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, GetName() + "_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL( GetName() + "_Execute"); m_PermuteFunction.run(); } -template class ClPermuteWorkload<DataType::Float32>; +template class ClPermuteWorkload<DataType::Float16, DataType::Float32>; template class ClPermuteWorkload<DataType::QuantisedAsymm8>; } // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp index 430c59524e..c8726bc2c6 100644 --- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp @@ -7,6 +7,7 @@ #include "backends/Workload.hpp" #include "backends/WorkloadData.hpp" +#include "backends/ClWorkloadUtils.hpp" #include <armnn/TypesUtils.hpp> #include <arm_compute/runtime/CL/functions/CLPermute.h> @@ -18,13 +19,13 @@ namespace armnn arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descriptor); -template <armnn::DataType DataType> -class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataType> +template<armnn::DataType... 
DataTypes> +class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...> { public: static const std::string& GetName() { - static const std::string name = std::string("ClPermute") + GetDataTypeName(DataType) + "Workload"; + static const std::string name = std::string("ClPermuteWorkload"); return name; } @@ -32,11 +33,11 @@ public: void Execute() const override; private: - using TypedWorkload<PermuteQueueDescriptor, DataType>::m_Data; + using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data; mutable arm_compute::CLPermute m_PermuteFunction; }; -using ClPermuteFloat32Workload = ClPermuteWorkload<DataType::Float32>; +using ClPermuteFloatWorkload = ClPermuteWorkload<DataType::Float16, DataType::Float32>; using ClPermuteUint8Workload = ClPermuteWorkload<DataType::QuantisedAsymm8>; -} //namespace armnn +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp index dbdc06f174..6b8a230912 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp @@ -25,10 +25,10 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input, return arm_compute::CLPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo); } -template <armnn::DataType dataType> -ClPooling2dBaseWorkload<dataType>::ClPooling2dBaseWorkload( +template <armnn::DataType... dataTypes> +ClPooling2dBaseWorkload<dataTypes...>::ClPooling2dBaseWorkload( const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name) - : TypedWorkload<Pooling2dQueueDescriptor, dataType>(descriptor, info) + : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info) { m_Data.ValidateInputsOutputs(name, 1, 1); @@ -37,11 +37,11 @@ ClPooling2dBaseWorkload<dataType>::ClPooling2dBaseWorkload( arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters); - // Run the layer + // Run the layer. m_PoolingLayer.configure(&input, &output, layerInfo); } -template class ClPooling2dBaseWorkload<DataType::Float32>; +template class ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>; template class ClPooling2dBaseWorkload<DataType::QuantisedAsymm8>; } diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp index 828f000505..aea32c9e86 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp @@ -14,12 +14,12 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const Pooling2dDescriptor& descriptor); -// Base class template providing an implementation of the Pooling2d layer common to all data types -template <armnn::DataType dataType> -class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataType> +// Base class template providing an implementation of the Pooling2d layer common to all data types. +template <armnn::DataType... 
dataTypes> +class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...> { public: - using TypedWorkload<Pooling2dQueueDescriptor, dataType>::m_Data; + using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data; ClPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name); diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp index a7f5855b8a..3a5b8ca526 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp @@ -10,13 +10,13 @@ namespace armnn ClPooling2dFloat32Workload::ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : ClPooling2dBaseWorkload<DataType::Float32>(descriptor, info, "ClPooling2dFloat32Workload") + : ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>(descriptor, info, "ClPooling2dFloat32Workload") { } void ClPooling2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dFloat32Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp index 3456a2cff8..ad189bdb52 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload<DataType::Float32> +class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32> { public: ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp index 2d2109e252..94cf753f5a 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp @@ -16,7 +16,7 @@ ClPooling2dUint8Workload::ClPooling2dUint8Workload(const Pooling2dQueueDescripto void ClPooling2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dUint8Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp index 7b4ad4415b..05fba222ac 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp @@ -11,7 +11,7 @@ namespace armnn { ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ReshapeQueueDescriptor>(descriptor, info) + : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClReshapeFloat32Workload", 1, 1); @@ -23,7 +23,7 @@ ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& void ClReshapeFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeFloat32Workload_Execute"); m_Layer.run(); } diff --git 
a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp index e344ee08ad..0eb4d08da0 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClReshapeFloat32Workload : public Float32Workload<ReshapeQueueDescriptor> +class ClReshapeFloat32Workload : public FloatWorkload<ReshapeQueueDescriptor> { public: ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp index 36cc1dec17..050fb9aa33 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp @@ -21,7 +21,7 @@ ClReshapeUint8Workload::ClReshapeUint8Workload(const ReshapeQueueDescriptor& des void ClReshapeUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeUint8Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp index d71011a2e3..abef682611 100644 --- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp @@ -14,7 +14,7 @@ namespace armnn ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ResizeBilinearQueueDescriptor>(descriptor, info) + : FloatWorkload<ResizeBilinearQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClResizeBilinearFloat32Workload", 1, 1); @@ -28,7 +28,7 @@ ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBil void ClResizeBilinearFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClResizeBilinearFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClResizeBilinearFloat32Workload_Execute"); m_ResizeBilinearLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp index 5f70e71619..81c0566bb3 100644 --- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClResizeBilinearFloat32Workload : public Float32Workload<ResizeBilinearQueueDescriptor> +class ClResizeBilinearFloat32Workload : public FloatWorkload<ResizeBilinearQueueDescriptor> { public: ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp new file mode 100644 index 0000000000..cd3107cfe1 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "ClSoftmaxBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + // NOTE: We report 4D Softmax as unsupported until full support is added to ACL + if(input.GetShape().GetNumDimensions() >= 4u) + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported"); + } + + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::CLSoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo); +} + +} diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp new file mode 100644 index 0000000000..e0113134af --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp @@ -0,0 +1,16 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp index 1d05172b42..08247bc593 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp @@ -12,7 +12,7 @@ namespace armnn ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<SoftmaxQueueDescriptor>(descriptor, info) + : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info) , m_SoftmaxLayer(memoryManager) { m_Data.ValidateInputsOutputs("ClSoftmaxFloat32Workload", 1, 1); @@ -24,7 +24,7 @@ ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& void ClSoftmaxFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSoftmaxFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxFloat32Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp index cf5c45ac6f..6cad59800b 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class ClSoftmaxFloat32Workload : public Float32Workload<SoftmaxQueueDescriptor> +class ClSoftmaxFloat32Workload : public FloatWorkload<SoftmaxQueueDescriptor> { public: ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp index ee9ab4754b..3cd9a6a5ec 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp @@ -33,7 +33,7 @@ ClSoftmaxUint8Workload::ClSoftmaxUint8Workload(const SoftmaxQueueDescriptor& des void ClSoftmaxUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, 
"ClSoftmaxUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxUint8Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp index 6221d56766..8a622c6caf 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void ClSplitterFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterFloat32Workload_Execute"); ClBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp index cfc7eaa3c2..affa9f840f 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class ClSplitterFloat32Workload : public ClBaseSplitterWorkload<DataType::Float32> +class ClSplitterFloat32Workload : public ClBaseSplitterWorkload<DataType::Float16, DataType::Float32> { public: - using ClBaseSplitterWorkload<DataType::Float32>::ClBaseSplitterWorkload; + using ClBaseSplitterWorkload<DataType::Float16, DataType::Float32>::ClBaseSplitterWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp index 3aa470894c..d2d25495e0 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void ClSplitterUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterUint8Workload_Execute"); ClBaseSplitterWorkload::Execute(); } |