author    Narumol Prangnawarat <narumol.prangnawarat@arm.com>  2020-03-30 16:11:04 +0100
committer Narumol Prangnawarat <narumol.prangnawarat@arm.com>  2020-03-31 09:29:40 +0100
commit    250d3927b16abe4d6932cd5dce1184bd7026a2b7 (patch)
tree      f73603873c0fbd692fbcbbd242d2a45cef6dc890
parent    e2062cdf1eb31b87860f9889f0e799e89f0dfa30 (diff)
download  armnn-250d3927b16abe4d6932cd5dce1184bd7026a2b7.tar.gz
IVGCVSW-4633 Add BF16 conversion support to Neon
* Add NeonConvertBf16ToFp32Workload
* Add NeonConvertFp32ToBf16Workload
* Add BFloat16 type support to NeonConstantWorkload and NeonTensorHandle
* Add ConvertBf16ToFp32Weight when ConvertBf16ToFp32Layer is added
* Unit tests

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Id5b44a203add5e0c98c1ca4e2162115741b56644
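For context on what these conversions do: BFloat16 keeps the sign bit and 8-bit exponent of an IEEE-754 float32 and truncates the mantissa to 7 bits, so a BF16 value is simply the top 16 bits of the corresponding float32. A minimal standalone C++ sketch of that bit-level relationship (illustrative only; the armnn converters used in the diff below may round rather than truncate when narrowing):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen a BF16 bit pattern to float32: BF16 is the top 16 bits of a float32.
float Bf16BitsToFloat32(uint16_t bf16)
{
    uint32_t bits = static_cast<uint32_t>(bf16) << 16;
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}

// Narrow float32 to BF16 by truncation (simplest scheme; real converters may round).
uint16_t Float32ToBf16Bits(float value)
{
    uint32_t bits;
    std::memcpy(&bits, &value, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

int main()
{
    float original = 1.5f;                          // exactly representable in BF16
    uint16_t narrowed = Float32ToBf16Bits(original);
    std::printf("%f -> 0x%04x -> %f\n", original, narrowed, Bf16BitsToFloat32(narrowed));
    return 0;
}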
-rw-r--r--  src/armnn/Network.cpp                                           | 32
-rw-r--r--  src/backends/aclCommon/ArmComputeTensorUtils.cpp                |  2
-rw-r--r--  src/backends/neon/NeonLayerSupport.cpp                          | 20
-rw-r--r--  src/backends/neon/NeonLayerSupport.hpp                          |  8
-rw-r--r--  src/backends/neon/NeonTensorHandle.hpp                          |  9
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.cpp                       | 14
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.hpp                       |  6
-rw-r--r--  src/backends/neon/backend.mk                                    |  2
-rw-r--r--  src/backends/neon/test/NeonLayerTests.cpp                       |  6
-rw-r--r--  src/backends/neon/workloads/CMakeLists.txt                      |  4
-rw-r--r--  src/backends/neon/workloads/NeonConstantWorkload.cpp            |  6
-rw-r--r--  src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp   | 43
-rw-r--r--  src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp   | 26
-rw-r--r--  src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp   | 44
-rw-r--r--  src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp   | 26
-rw-r--r--  src/backends/neon/workloads/NeonWorkloads.hpp                   |  2
16 files changed, 250 insertions, 0 deletions
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index c2da4da41e..a443721a45 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -146,6 +146,30 @@ bool CheckScaleSetOnQuantizedType(Layer* layer, Optional<std::vector<std::string
return noErrors;
}
+template <typename LayerT>
+LayerT* ConvertBf16ToFp32Weight(Layer* l)
+{
+ LayerT* layer = boost::polymorphic_downcast<LayerT*>(l);
+ if ((layer->GetType() == LayerType::Convolution2d || layer->GetType() == LayerType::FullyConnected)
+ && layer->m_Weight)
+ {
+ const TensorInfo& info = layer->m_Weight->GetTensorInfo();
+
+ if (info.GetDataType() == DataType::BFloat16)
+ {
+ std::vector<float> newValues(info.GetNumElements());
+
+ armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(
+ layer->m_Weight->template GetTensor<armnn::BFloat16>(), info.GetNumElements(), newValues.data());
+
+ TensorInfo newInfo(info.GetShape(), DataType::Float32);
+ ConstTensor newInput(newInfo, newValues);
+ layer->m_Weight.reset(new ScopedCpuTensorHandle(newInput));
+ }
+ }
+ return layer;
+}
+
OptimizationResult AttemptBackendAssignment(BackendSettings& backendSettings,
Graph& graph,
Layer* layer,
@@ -260,6 +284,14 @@ OptimizationResult AttemptBackendAssignment(BackendSettings& backendSettings,
{
convertBf16ToFp32Layers =
InsertConvertBf16ToFp32LayersBefore(graph, *layer);
+ if (layer->GetType() == LayerType::Convolution2d)
+ {
+ ConvertBf16ToFp32Weight<Convolution2dLayer>(layer);
+ }
+ else if (layer->GetType() == LayerType::FullyConnected)
+ {
+ ConvertBf16ToFp32Weight<FullyConnectedLayer>(layer);
+ }
}
// Insert FP32 -> BF16 conversion layer after current layer
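The weight conversion above only runs when the optimizer has reduced eligible FP32 layers to BF16 and then re-inserted ConvertBf16ToFp32 layers in front of them. Below is a hedged sketch of how a caller might opt in to that reduction on the Neon (CpuAcc) backend; the OptimizerOptions field name m_ReduceFp32ToBf16 is assumed from the Arm NN release this patch targets and does not appear in this diff:

#include <armnn/ArmNN.hpp>

// Hedged sketch: enable the FP32 -> BF16 reduction that triggers the
// ConvertBf16ToFp32Weight path above. m_ReduceFp32ToBf16 is an assumption,
// not taken from this diff.
armnn::IOptimizedNetworkPtr OptimizeForBf16(const armnn::INetwork& network,
                                            armnn::IRuntime& runtime)
{
    armnn::OptimizerOptions options;
    options.m_ReduceFp32ToBf16 = true;            // reduce eligible FP32 layers to BF16

    // CpuAcc is the Neon backend extended by this patch.
    return armnn::Optimize(network, { armnn::Compute::CpuAcc },
                           runtime.GetDeviceSpec(), options);
}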
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.cpp b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
index 84091e8fb3..f5a9e05de9 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.cpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
@@ -17,6 +17,8 @@ arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType, bool multi
{
switch(dataType)
{
+ case armnn::DataType::BFloat16:
+ return arm_compute::DataType::BFLOAT16;
case armnn::DataType::Boolean:
return arm_compute::DataType::U8;
case armnn::DataType::Float16:
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index c01a178f18..44e84fb974 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -259,6 +259,16 @@ bool NeonLayerSupport::IsConstantSupported(const TensorInfo& output,
&TrueFunc<>);
}
+bool NeonLayerSupport::IsConvertBf16ToFp32Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ armnn::IgnoreUnused(input);
+ armnn::IgnoreUnused(output);
+ armnn::IgnoreUnused(reasonIfUnsupported);
+ return true;
+}
+
bool NeonLayerSupport::IsConvertFp16ToFp32Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported) const
@@ -269,6 +279,16 @@ bool NeonLayerSupport::IsConvertFp16ToFp32Supported(const TensorInfo& input,
return true;
}
+bool NeonLayerSupport::IsConvertFp32ToBf16Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ armnn::IgnoreUnused(input);
+ armnn::IgnoreUnused(output);
+ armnn::IgnoreUnused(reasonIfUnsupported);
+ return true;
+}
+
bool NeonLayerSupport::IsConvertFp32ToFp16Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index f45db35ffe..fda0bc30d3 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -60,10 +60,18 @@ public:
bool IsConstantSupported(const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+ bool IsConvertBf16ToFp32Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
bool IsConvertFp16ToFp32Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+ bool IsConvertFp32ToBf16Supported(const TensorInfo& input,
+ const TensorInfo& output,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
bool IsConvertFp32ToFp16Supported(const TensorInfo& input,
const TensorInfo& output,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
index 2e9be11be1..11d20878d7 100644
--- a/src/backends/neon/NeonTensorHandle.hpp
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -4,6 +4,7 @@
//
#pragma once
+#include <BFloat16.hpp>
#include <Half.hpp>
#include <aclCommon/ArmComputeTensorHandle.hpp>
@@ -176,6 +177,10 @@ private:
armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
static_cast<uint8_t*>(memory));
break;
+ case arm_compute::DataType::BFLOAT16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::BFloat16*>(memory));
+ break;
case arm_compute::DataType::F16:
armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
static_cast<armnn::Half*>(memory));
@@ -210,6 +215,10 @@ private:
armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
this->GetTensor());
break;
+ case arm_compute::DataType::BFLOAT16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::BFloat16*>(memory),
+ this->GetTensor());
+ break;
case arm_compute::DataType::F16:
armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
this->GetTensor());
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index 982104e38e..47f72050a5 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -154,6 +154,13 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQue
return std::make_unique<NeonConstantWorkload>(descriptor, info);
}
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertBf16ToFp32(
+ const ConvertBf16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertBf16ToFp32Workload>(descriptor, info);
+}
+
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
const ConvertFp16ToFp32QueueDescriptor& descriptor,
const WorkloadInfo& info) const
@@ -161,6 +168,13 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
}
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToBf16(
+ const ConvertFp32ToBf16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertFp32ToBf16Workload>(descriptor, info);
+}
+
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
const ConvertFp32ToFp16QueueDescriptor& descriptor,
const WorkloadInfo& info) const
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
index f122792203..d6968fa9dd 100644
--- a/src/backends/neon/NeonWorkloadFactory.hpp
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -66,9 +66,15 @@ public:
std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
+ std::unique_ptr<IWorkload> CreateConvertBf16ToFp32(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
+ std::unique_ptr<IWorkload> CreateConvertFp32ToBf16(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 3b3333a0c3..c6b22306ea 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -30,6 +30,8 @@ BACKEND_SOURCES := \
workloads/NeonComparisonWorkload.cpp \
workloads/NeonConcatWorkload.cpp \
workloads/NeonConstantWorkload.cpp \
+ workloads/NeonConvertBf16ToFp32Workload.cpp \
+ workloads/NeonConvertFp32ToBf16Workload.cpp \
workloads/NeonConvertFp16ToFp32Workload.cpp \
workloads/NeonConvertFp32ToFp16Workload.cpp \
workloads/NeonConvolution2dWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 19a89160c4..20644fe47f 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -509,6 +509,12 @@ ARMNN_AUTO_TEST_CASE(ConcatUint8, ConcatUint8Test)
ARMNN_AUTO_TEST_CASE(ConcatUint8DifferentInputOutputQParam,
ConcatDifferentInputOutputQParamTest<DataType::QAsymmU8>, false)
+// Convert from BFloat16 to Float32
+ARMNN_AUTO_TEST_CASE(ConvertBf16ToFp32, ConvertBf16ToFp32Test)
+
+// Convert from Float32 to BFloat16
+ARMNN_AUTO_TEST_CASE(ConvertFp32ToBf16, ConvertFp32ToBf16Test)
+
// Fully Connected
ARMNN_AUTO_TEST_CASE(SimpleFullyConnected, FullyConnectedFloat32Test, false, false)
ARMNN_AUTO_TEST_CASE(SimpleFullyConnectedWithBias, FullyConnectedFloat32Test, true, false)
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index cbe1e3b945..7db315f116 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -22,8 +22,12 @@ list(APPEND armnnNeonBackendWorkloads_sources
NeonConcatWorkload.hpp
NeonConstantWorkload.cpp
NeonConstantWorkload.hpp
+ NeonConvertBf16ToFp32Workload.cpp
+ NeonConvertBf16ToFp32Workload.hpp
NeonConvertFp16ToFp32Workload.cpp
NeonConvertFp16ToFp32Workload.hpp
+ NeonConvertFp32ToBf16Workload.cpp
+ NeonConvertFp32ToBf16Workload.hpp
NeonConvertFp32ToFp16Workload.cpp
NeonConvertFp32ToFp16Workload.hpp
NeonConvolution2dWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonConstantWorkload.cpp b/src/backends/neon/workloads/NeonConstantWorkload.cpp
index 08f0390e34..83a2692b6e 100644
--- a/src/backends/neon/workloads/NeonConstantWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConstantWorkload.cpp
@@ -6,6 +6,7 @@
#include "NeonConstantWorkload.hpp"
#include <arm_compute/core/Types.h>
+#include <BFloat16.hpp>
#include <Half.hpp>
#include <aclCommon/ArmComputeTensorUtils.hpp>
#include <neon/NeonTensorHandle.hpp>
@@ -46,6 +47,11 @@ void NeonConstantWorkload::Execute() const
switch (computeDataType)
{
+ case arm_compute::DataType::BFLOAT16:
+ {
+ CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<BFloat16>(), output);
+ break;
+ }
case arm_compute::DataType::F16:
{
CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
new file mode 100644
index 0000000000..79d1f22313
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertBf16ToFp32Workload.hpp"
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <BFloat16.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+namespace armnn
+{
+
+NeonConvertBf16ToFp32Workload::NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertBf16ToFp32Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertBf16ToFp32Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertBf16ToFp32Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const BFloat16*>(src);
+ auto output = reinterpret_cast<float*>(dst);
+ size_t numElements = size/2; // 2 bytes per Bf16
+ armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
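The convertFunc above derives the element count from the BF16 buffer size (two bytes per element) and hands the widening off to armnnUtils::FloatingPointConverter. A small hedged usage sketch of that same converter call, outside the workload machinery:

#include <BFloat16.hpp>
#include <armnnUtils/FloatingPointConverter.hpp>

#include <vector>

// Hedged sketch using only the converter call visible in this diff:
// widen a BF16 buffer to float32, as the workload's convertFunc does per tensor.
std::vector<float> WidenBf16Buffer(const std::vector<armnn::BFloat16>& input)
{
    std::vector<float> output(input.size());
    armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(
        input.data(), input.size(), output.data());
    return output;
}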
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
new file mode 100644
index 0000000000..0969088b92
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonConvertBf16ToFp32Workload : public BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>
+{
+public:
+ NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
new file mode 100644
index 0000000000..e1aceec197
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
@@ -0,0 +1,44 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertFp32ToBf16Workload.hpp"
+
+#include <BFloat16.hpp>
+#include <Profiling.hpp>
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+namespace armnn
+{
+
+NeonConvertFp32ToBf16Workload::NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToBf16Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp32ToBf16Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToBf16Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const float*>(src);
+ auto output = reinterpret_cast<BFloat16*>(dst);
+ size_t numElements = size/2; // 2 bytes per bf16
+ armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
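Mirroring the widening case, a hedged sketch of the narrowing call used by this workload's convertFunc:

#include <BFloat16.hpp>
#include <armnnUtils/FloatingPointConverter.hpp>

#include <vector>

// Hedged sketch of the opposite direction: narrow a float32 buffer to BF16,
// as this workload's convertFunc does per tensor.
std::vector<armnn::BFloat16> NarrowToBf16(const std::vector<float>& input)
{
    std::vector<armnn::BFloat16> output(input.size());
    armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(
        input.data(), input.size(), output.data());
    return output;
}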
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
new file mode 100644
index 0000000000..bc96c16287
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonConvertFp32ToBf16Workload : public Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>
+{
+public:
+ NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 2b7eabeb0d..f25554722d 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -13,7 +13,9 @@
#include "NeonBatchToSpaceNdWorkload.hpp"
#include "NeonComparisonWorkload.hpp"
#include "NeonConstantWorkload.hpp"
+#include "NeonConvertBf16ToFp32Workload.hpp"
#include "NeonConvertFp16ToFp32Workload.hpp"
+#include "NeonConvertFp32ToBf16Workload.hpp"
#include "NeonConvertFp32ToFp16Workload.hpp"
#include "NeonConvolution2dWorkload.hpp"
#include "NeonDepthToSpaceWorkload.hpp"