From 0b51d5ad533f8ecde71f957077690195eea29ffc Mon Sep 17 00:00:00 2001 From: Narumol Prangnawarat Date: Wed, 20 Jan 2021 15:58:29 +0000 Subject: IVGCVSW-5619 Add OptimizerOptions and NetworkProperties to ArmNN Delegate * Add OptimizerOptions, NetworkProperties, DebugCallbackFunction to DelegateOptions * Enable OptimizerOptions when the network is being optimized * Enable NetworkProperties when loading network * Enable DebugCallbackFunction * Add error message when loading network * Log warning instead of error when operator is not supported but could fallback to another backend * Improve uint16_t CompareData * Unit tests Signed-off-by: Narumol Prangnawarat Change-Id: I353035afb442774bfeb1c62570a90755c2ceaf38 --- delegate/CMakeLists.txt | 2 + delegate/include/DelegateOptions.hpp | 42 +++ delegate/src/DelegateOptions.cpp | 28 ++ delegate/src/DelegateUtils.hpp | 7 +- delegate/src/armnn_delegate.cpp | 22 +- delegate/src/test/ArmnnDelegateTest.cpp | 34 ++- delegate/src/test/DelegateOptionsTest.cpp | 157 +++++++++++ delegate/src/test/DelegateOptionsTestHelper.hpp | 298 +++++++++++++++++++++ delegate/src/test/TestUtils.cpp | 13 +- .../optimizations/ConvertFp32NetworkToBf16.hpp | 3 +- 10 files changed, 593 insertions(+), 13 deletions(-) create mode 100644 delegate/src/test/DelegateOptionsTest.cpp create mode 100644 delegate/src/test/DelegateOptionsTestHelper.hpp diff --git a/delegate/CMakeLists.txt b/delegate/CMakeLists.txt index 495c1e3d77..ba8ba6de00 100644 --- a/delegate/CMakeLists.txt +++ b/delegate/CMakeLists.txt @@ -118,6 +118,8 @@ if(BUILD_UNIT_TESTS) src/test/ControlTestHelper.hpp src/test/Convolution2dTest.cpp src/test/ConvolutionTestHelper.hpp + src/test/DelegateOptionsTest.cpp + src/test/DelegateOptionsTestHelper.hpp src/test/DepthwiseConvolution2dTest.cpp src/test/ElementwiseBinaryTest.cpp src/test/ElementwiseBinaryTestHelper.hpp diff --git a/delegate/include/DelegateOptions.hpp b/delegate/include/DelegateOptions.hpp index 6058061b3d..82de07607e 100644 --- a/delegate/include/DelegateOptions.hpp +++ b/delegate/include/DelegateOptions.hpp @@ -27,6 +27,18 @@ public: const std::vector& backendOptions = {}, armnn::Optional logSeverityLevel = armnn::EmptyOptional()); + DelegateOptions(armnn::Compute computeDevice, + const armnn::OptimizerOptions& optimizerOptions, + const armnn::INetworkProperties& networkProperties = armnn::INetworkProperties(), + const armnn::Optional& logSeverityLevel = armnn::EmptyOptional(), + const armnn::Optional& func = armnn::EmptyOptional()); + + DelegateOptions(const std::vector& backends, + const armnn::OptimizerOptions& optimizerOptions, + const armnn::INetworkProperties& networkProperties = armnn::INetworkProperties(), + const armnn::Optional& logSeverityLevel = armnn::EmptyOptional(), + const armnn::Optional& func = armnn::EmptyOptional()); + const std::vector& GetBackends() const { return m_Backends; } void SetBackends(const std::vector& backends) { m_Backends = backends; } @@ -45,6 +57,13 @@ public: bool IsLoggingEnabled() { return m_LoggingSeverity.has_value(); } + const armnn::OptimizerOptions& GetOptimizerOptions() const { return m_OptimizerOptions; } + + const armnn::Optional& GetDebugCallbackFunction() const + { return m_DebugCallbackFunc; } + + const armnn::INetworkProperties& GetNetworkProperties() const { return m_NetworkProperties; }; + private: /// Which backend to run Delegate on. /// Examples of possible values are: CpuRef, CpuAcc, GpuAcc. 
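For readers of the new DelegateOptions surface above, a minimal usage sketch follows. It is illustrative only: the backend choice, the positional meaning of the OptimizerOptions booleans and the callback body are assumptions for this example rather than something the patch prescribes; the constructor order (backends, optimizerOptions, networkProperties, logSeverityLevel, func) is taken from the declaration above.

    // Illustrative only: enable ArmNN Debug layers and register a callback that fires
    // for every intermediate tensor the delegate produces (needs <iostream> for std::cout).
    auto debugCallback = [](armnn::LayerGuid guid, unsigned int slotIndex, armnn::ITensorHandle* tensor)
    {
        armnn::IgnoreUnused(tensor);
        std::cout << "Layer " << guid << " wrote output slot " << slotIndex << std::endl;
    };

    std::vector<armnn::BackendId> backends = { armnn::Compute::CpuRef };
    armnn::OptimizerOptions optimizerOptions(/*reduceFp32ToFp16=*/false,
                                             /*debug=*/true,
                                             /*reduceFp32ToBf16=*/false,
                                             /*importEnabled=*/false);

    armnnDelegate::DelegateOptions delegateOptions(
        backends,
        optimizerOptions,
        armnn::INetworkProperties(),                                   // defaults: no import/export
        armnn::EmptyOptional(),                                        // keep the default log severity
        armnn::Optional<armnn::DebugCallbackFunction>(debugCallback));

If no callback is supplied but m_Debug is set in the OptimizerOptions, the Debug layers fall back to printing the intermediate tensors to standard output, which is the behaviour the DelegateOptionsTest cases further down rely on.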
@@ -70,8 +89,31 @@ private: /// "KernelProfilingEnabled" : bool [true | false] std::vector m_BackendOptions; + /// OptimizerOptions + /// Reduce Fp32 data to Fp16 for faster processing + /// bool m_ReduceFp32ToFp16; + /// Add debug data for easier troubleshooting + /// bool m_Debug; + /// Reduce Fp32 data to Bf16 for faster processing + /// bool m_ReduceFp32ToBf16; + /// Infer output size when not available + /// ShapeInferenceMethod m_shapeInferenceMethod; + /// Enable Import + /// bool m_ImportEnabled; + /// Enable Model Options + /// ModelOptions m_ModelOptions; + armnn::OptimizerOptions m_OptimizerOptions; + + /// Network properties to enable memory import + armnn::INetworkProperties m_NetworkProperties; + /// Severity level for logging within ArmNN that will be used on creation of the delegate armnn::Optional m_LoggingSeverity; + + /// A callback function to debug layers performing custom computations on intermediate tensors. + /// If a function is not registered, and debug is enabled in OptimizerOptions, + /// debug will print information of the intermediate tensors. + armnn::Optional m_DebugCallbackFunc; }; } // namespace armnnDelegate diff --git a/delegate/src/DelegateOptions.cpp b/delegate/src/DelegateOptions.cpp index 3ec2d20d77..400bf78766 100644 --- a/delegate/src/DelegateOptions.cpp +++ b/delegate/src/DelegateOptions.cpp @@ -22,4 +22,32 @@ DelegateOptions::DelegateOptions(const std::vector& backends, { } +DelegateOptions::DelegateOptions(armnn::Compute computeDevice, + const armnn::OptimizerOptions& optimizerOptions, + const armnn::INetworkProperties& networkProperties, + const armnn::Optional& logSeverityLevel, + const armnn::Optional& func) + : m_Backends({computeDevice}), + m_BackendOptions({}), + m_OptimizerOptions(optimizerOptions), + m_NetworkProperties(networkProperties), + m_LoggingSeverity(logSeverityLevel), + m_DebugCallbackFunc(func) +{ +} + +DelegateOptions::DelegateOptions(const std::vector& backends, + const armnn::OptimizerOptions& optimizerOptions, + const armnn::INetworkProperties& networkProperties, + const armnn::Optional& logSeverityLevel, + const armnn::Optional& func) + : m_Backends(backends), + m_BackendOptions({}), + m_OptimizerOptions(optimizerOptions), + m_NetworkProperties(networkProperties), + m_LoggingSeverity(logSeverityLevel), + m_DebugCallbackFunc(func) +{ +} + } // namespace armnnDelegate diff --git a/delegate/src/DelegateUtils.hpp b/delegate/src/DelegateUtils.hpp index 990f210734..58eeb9ab63 100644 --- a/delegate/src/DelegateUtils.hpp +++ b/delegate/src/DelegateUtils.hpp @@ -42,12 +42,13 @@ try \ { \ if (reasonIfUnsupported.size() > 0) \ { \ - TF_LITE_KERNEL_LOG( \ - tfLiteContext, "%s: not supported by armnn: %s", funcName, reasonIfUnsupported.c_str()); \ + TFLITE_LOG_PROD(tflite::TFLITE_LOG_WARNING, \ + "%s: not supported by armnn: %s", funcName, reasonIfUnsupported.c_str()); \ } \ else \ { \ - TF_LITE_KERNEL_LOG(tfLiteContext, "%s: not supported by armnn", funcName); \ + TFLITE_LOG_PROD(tflite::TFLITE_LOG_WARNING, \ + "%s: not supported by armnn", funcName); \ } \ } \ } \ diff --git a/delegate/src/armnn_delegate.cpp b/delegate/src/armnn_delegate.cpp index 6250a5f638..6dba890509 100644 --- a/delegate/src/armnn_delegate.cpp +++ b/delegate/src/armnn_delegate.cpp @@ -134,6 +134,10 @@ Delegate::Delegate(armnnDelegate::DelegateOptions options) { runtimeOptions.m_BackendOptions = backendOptions; } + else if (!m_Options.GetOptimizerOptions().m_ModelOptions.empty()) + { + runtimeOptions.m_BackendOptions = 
m_Options.GetOptimizerOptions().m_ModelOptions; + } m_Runtime = armnn::IRuntime::Create(runtimeOptions); std::vector backends; @@ -288,7 +292,6 @@ ArmnnSubgraph* ArmnnSubgraph::Create(TfLiteContext* tfLiteContext, delegateData.m_OutputSlotForNode = std::vector(tfLiteContext->tensors_size, nullptr); - std::vector inputBindings; std::vector outputBindings; @@ -331,7 +334,8 @@ ArmnnSubgraph* ArmnnSubgraph::Create(TfLiteContext* tfLiteContext, { optNet = armnn::Optimize(*(delegateData.m_Network.get()), delegate->m_Options.GetBackends(), - delegate->m_Runtime->GetDeviceSpec()); + delegate->m_Runtime->GetDeviceSpec(), + delegate->m_Options.GetOptimizerOptions()); } catch (std::exception &ex) { @@ -348,11 +352,15 @@ ArmnnSubgraph* ArmnnSubgraph::Create(TfLiteContext* tfLiteContext, try { // Load graph into runtime - auto loadingStatus = delegate->m_Runtime->LoadNetwork(networkId, std::move(optNet)); + std::string errorMessage; + auto loadingStatus = delegate->m_Runtime->LoadNetwork(networkId, + std::move(optNet), + errorMessage, + delegate->m_Options.GetNetworkProperties()); if (loadingStatus != armnn::Status::Success) { // Optimize failed - throw armnn::Exception("TfLiteArmnnDelegate: Network could not be loaded!");; + throw armnn::Exception("TfLiteArmnnDelegate: Network could not be loaded:" + errorMessage); } } catch (std::exception& ex) @@ -362,6 +370,12 @@ ArmnnSubgraph* ArmnnSubgraph::Create(TfLiteContext* tfLiteContext, throw armnn::Exception(exMessage.str()); } + // Register debug callback function + if (delegate->m_Options.GetDebugCallbackFunction().has_value()) + { + delegate->m_Runtime->RegisterDebugCallback(networkId, delegate->m_Options.GetDebugCallbackFunction().value()); + } + // Create a new SubGraph with networkId and runtime return new ArmnnSubgraph(networkId, delegate->m_Runtime.get(), inputBindings, outputBindings); } diff --git a/delegate/src/test/ArmnnDelegateTest.cpp b/delegate/src/test/ArmnnDelegateTest.cpp index 84bc31172d..bc73dde2ef 100644 --- a/delegate/src/test/ArmnnDelegateTest.cpp +++ b/delegate/src/test/ArmnnDelegateTest.cpp @@ -21,7 +21,7 @@ TEST_SUITE("ArmnnDelegate") TEST_CASE ("ArmnnDelegate Registered") { using namespace tflite; - auto tfLiteInterpreter = std::make_unique(); + auto tfLiteInterpreter = std::make_unique(); tfLiteInterpreter->AddTensors(3); tfLiteInterpreter->SetInputs({0, 1}); @@ -56,6 +56,38 @@ TEST_CASE ("ArmnnDelegate Registered") CHECK(tfLiteInterpreter != nullptr); } +TEST_CASE ("ArmnnDelegateOptimizerOptionsRegistered") +{ + using namespace tflite; + auto tfLiteInterpreter = std::make_unique(); + + tfLiteInterpreter->AddTensors(3); + tfLiteInterpreter->SetInputs({0, 1}); + tfLiteInterpreter->SetOutputs({2}); + + tfLiteInterpreter->SetTensorParametersReadWrite(0, kTfLiteFloat32, "input1", {1,2,2,1}, TfLiteQuantization()); + tfLiteInterpreter->SetTensorParametersReadWrite(1, kTfLiteFloat32, "input2", {1,2,2,1}, TfLiteQuantization()); + tfLiteInterpreter->SetTensorParametersReadWrite(2, kTfLiteFloat32, "output", {1,2,2,1}, TfLiteQuantization()); + + tflite::ops::builtin::BuiltinOpResolver opResolver; + const TfLiteRegistration* opRegister = opResolver.FindOp(BuiltinOperator_ADD, 1); + tfLiteInterpreter->AddNodeWithParameters({0, 1}, {2}, "", 0, nullptr, opRegister); + + // Create the Armnn Delegate + std::vector backends = { armnn::Compute::CpuRef }; + + armnn::OptimizerOptions optimizerOptions(true, true, false, true); + + armnnDelegate::DelegateOptions delegateOptions(backends, optimizerOptions); + std::unique_ptr + 
theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions), + armnnDelegate::TfLiteArmnnDelegateDelete); + + auto status = tfLiteInterpreter->ModifyGraphWithDelegate(std::move(theArmnnDelegate)); + CHECK(status == kTfLiteOk); + CHECK(tfLiteInterpreter != nullptr); +} + } } // namespace armnnDelegate diff --git a/delegate/src/test/DelegateOptionsTest.cpp b/delegate/src/test/DelegateOptionsTest.cpp new file mode 100644 index 0000000000..c623781301 --- /dev/null +++ b/delegate/src/test/DelegateOptionsTest.cpp @@ -0,0 +1,157 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "DelegateOptionsTestHelper.hpp" + +namespace armnnDelegate +{ + +TEST_SUITE("DelegateOptions") +{ + +TEST_CASE ("ArmnnDelegateOptimizerOptionsReduceFp32ToFp16") +{ + std::stringstream ss; + { + StreamRedirector redirect(std::cout, ss.rdbuf()); + + std::vector backends = { armnn::Compute::CpuRef }; + std::vector tensorShape { 1, 2, 2, 1 }; + std::vector inputData = { 1, 2, 3, 4 }; + std::vector divData = { 2, 2, 3, 4 }; + std::vector expectedResult = { 1, 2, 2, 2 }; + + // Enable ReduceFp32ToFp16 + armnn::OptimizerOptions optimizerOptions(true, true, false, false); + armnn::INetworkProperties networkProperties; + armnnDelegate::DelegateOptions delegateOptions(backends, optimizerOptions, networkProperties); + + DelegateOptionTest(::tflite::TensorType_FLOAT32, + backends, + tensorShape, + inputData, + inputData, + divData, + expectedResult, + delegateOptions); + } + // ReduceFp32ToFp16 option is enabled + CHECK(ss.str().find("convert_fp32_to_fp16") != std::string::npos); + CHECK(ss.str().find("convert_fp16_to_fp32") != std::string::npos); +} + +TEST_CASE ("ArmnnDelegateOptimizerOptionsDebug") +{ + std::stringstream ss; + { + StreamRedirector redirect(std::cout, ss.rdbuf()); + + std::vector backends = { armnn::Compute::CpuRef }; + std::vector tensorShape { 1, 2, 2, 1 }; + std::vector inputData = { 1, 2, 3, 4 }; + std::vector divData = { 2, 2, 3, 4 }; + std::vector expectedResult = { 1, 2, 2, 2 }; + + // Enable Debug + armnn::OptimizerOptions optimizerOptions(false, true, false, false); + armnn::INetworkProperties networkProperties; + armnnDelegate::DelegateOptions delegateOptions(backends, optimizerOptions, networkProperties); + + DelegateOptionTest(::tflite::TensorType_FLOAT32, + backends, + tensorShape, + inputData, + inputData, + divData, + expectedResult, + delegateOptions); + } + // Debug option triggered. 
+ CHECK(ss.str().find("layerGuid") != std::string::npos); + CHECK(ss.str().find("layerName") != std::string::npos); + CHECK(ss.str().find("outputSlot") != std::string::npos); + CHECK(ss.str().find("shape") != std::string::npos); + CHECK(ss.str().find("data") != std::string::npos); +} + +TEST_CASE ("ArmnnDelegateOptimizerOptionsDebugFunction") +{ + std::vector backends = { armnn::Compute::CpuRef }; + std::vector tensorShape { 1, 2, 2, 1 }; + std::vector inputData = { 1, 2, 3, 4 }; + std::vector divData = { 2, 2, 3, 4 }; + std::vector expectedResult = { 1, 2, 2, 2 }; + + // Enable debug with debug callback function + armnn::OptimizerOptions optimizerOptions(false, true, false, false); + bool callback = false; + auto mockCallback = [&](armnn::LayerGuid guid, unsigned int slotIndex, armnn::ITensorHandle* tensor) + { + armnn::IgnoreUnused(guid); + armnn::IgnoreUnused(slotIndex); + armnn::IgnoreUnused(tensor); + callback = true; + }; + + armnn::INetworkProperties networkProperties; + armnnDelegate::DelegateOptions delegateOptions(backends, + optimizerOptions, + networkProperties, + armnn::EmptyOptional(), + armnn::Optional(mockCallback)); + + CHECK(!callback); + + DelegateOptionTest(::tflite::TensorType_FLOAT32, + backends, + tensorShape, + inputData, + inputData, + divData, + expectedResult, + delegateOptions); + + // Check that the debug callback function was called. + CHECK(callback); +} + +TEST_CASE ("ArmnnDelegateOptimizerOptionsReduceFp32ToBf16") +{ + std::stringstream ss; + { + StreamRedirector redirect(std::cout, ss.rdbuf()); + + ReduceFp32ToBf16TestImpl(); + } + + // ReduceFp32ToBf16 option is enabled + CHECK(ss.str().find("convert_fp32_to_bf16") != std::string::npos); +} + +TEST_CASE ("ArmnnDelegateOptimizerOptionsImport") +{ + std::vector backends = { armnn::Compute::CpuAcc, armnn::Compute::CpuRef }; + std::vector tensorShape { 1, 2, 2, 1 }; + std::vector inputData = { 1, 2, 3, 4 }; + std::vector divData = { 2, 2, 3, 4 }; + std::vector expectedResult = { 1, 2, 2, 2}; + + armnn::OptimizerOptions optimizerOptions(false, false, false, true); + armnn::INetworkProperties networkProperties(true, true); + armnnDelegate::DelegateOptions delegateOptions(backends, optimizerOptions, networkProperties); + + DelegateOptionTest(::tflite::TensorType_UINT8, + backends, + tensorShape, + inputData, + inputData, + divData, + expectedResult, + delegateOptions); +} + +} + +} // namespace armnnDelegate diff --git a/delegate/src/test/DelegateOptionsTestHelper.hpp b/delegate/src/test/DelegateOptionsTestHelper.hpp new file mode 100644 index 0000000000..6e0cc3154c --- /dev/null +++ b/delegate/src/test/DelegateOptionsTestHelper.hpp @@ -0,0 +1,298 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include + +#include "ConvolutionTestHelper.hpp" +#include "TestUtils.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +namespace +{ + +struct StreamRedirector +{ +public: + StreamRedirector(std::ostream &stream, std::streambuf *newStreamBuffer) + : m_Stream(stream), m_BackupBuffer(m_Stream.rdbuf(newStreamBuffer)) {} + + ~StreamRedirector() { m_Stream.rdbuf(m_BackupBuffer); } + +private: + std::ostream &m_Stream; + std::streambuf *m_BackupBuffer; +}; + +std::vector CreateAddDivTfLiteModel(tflite::TensorType tensorType, + const std::vector& tensorShape, + float quantScale = 1.0f, + int quantOffset = 0) +{ + using namespace tflite; + flatbuffers::FlatBufferBuilder flatBufferBuilder; + + std::vector> buffers; + buffers.push_back(CreateBuffer(flatBufferBuilder, flatBufferBuilder.CreateVector({}))); + + auto quantizationParameters = + CreateQuantizationParameters(flatBufferBuilder, + 0, + 0, + flatBufferBuilder.CreateVector({ quantScale }), + flatBufferBuilder.CreateVector({ quantOffset })); + + + std::array, 5> tensors; + tensors[0] = CreateTensor(flatBufferBuilder, + flatBufferBuilder.CreateVector(tensorShape.data(), + tensorShape.size()), + tensorType, + 0, + flatBufferBuilder.CreateString("input_0"), + quantizationParameters); + tensors[1] = CreateTensor(flatBufferBuilder, + flatBufferBuilder.CreateVector(tensorShape.data(), + tensorShape.size()), + tensorType, + 0, + flatBufferBuilder.CreateString("input_1"), + quantizationParameters); + tensors[2] = CreateTensor(flatBufferBuilder, + flatBufferBuilder.CreateVector(tensorShape.data(), + tensorShape.size()), + tensorType, + 0, + flatBufferBuilder.CreateString("input_2"), + quantizationParameters); + tensors[3] = CreateTensor(flatBufferBuilder, + flatBufferBuilder.CreateVector(tensorShape.data(), + tensorShape.size()), + tensorType, + 0, + flatBufferBuilder.CreateString("add"), + quantizationParameters); + tensors[4] = CreateTensor(flatBufferBuilder, + flatBufferBuilder.CreateVector(tensorShape.data(), + tensorShape.size()), + tensorType, + 0, + flatBufferBuilder.CreateString("output"), + quantizationParameters); + + // create operator + tflite::BuiltinOptions addBuiltinOptionsType = tflite::BuiltinOptions_AddOptions; + flatbuffers::Offset addBuiltinOptions = + CreateAddOptions(flatBufferBuilder, ActivationFunctionType_NONE).Union(); + + tflite::BuiltinOptions divBuiltinOptionsType = tflite::BuiltinOptions_DivOptions; + flatbuffers::Offset divBuiltinOptions = + CreateAddOptions(flatBufferBuilder, ActivationFunctionType_NONE).Union(); + + std::array, 2> operators; + const std::vector addInputs{0, 1}; + const std::vector addOutputs{3}; + operators[0] = CreateOperator(flatBufferBuilder, + 0, + flatBufferBuilder.CreateVector(addInputs.data(), addInputs.size()), + flatBufferBuilder.CreateVector(addOutputs.data(), addOutputs.size()), + addBuiltinOptionsType, + addBuiltinOptions); + const std::vector divInputs{3, 2}; + const std::vector divOutputs{4}; + operators[1] = CreateOperator(flatBufferBuilder, + 1, + flatBufferBuilder.CreateVector(divInputs.data(), divInputs.size()), + flatBufferBuilder.CreateVector(divOutputs.data(), divOutputs.size()), + divBuiltinOptionsType, + divBuiltinOptions); + + const std::vector subgraphInputs{0, 1, 2}; + const std::vector subgraphOutputs{4}; + flatbuffers::Offset subgraph = + CreateSubGraph(flatBufferBuilder, + flatBufferBuilder.CreateVector(tensors.data(), tensors.size()), + 
flatBufferBuilder.CreateVector(subgraphInputs.data(), subgraphInputs.size()), + flatBufferBuilder.CreateVector(subgraphOutputs.data(), subgraphOutputs.size()), + flatBufferBuilder.CreateVector(operators.data(), operators.size())); + + flatbuffers::Offset modelDescription = + flatBufferBuilder.CreateString("ArmnnDelegate: Add and Div Operator Model"); + + std::array, 2> codes; + codes[0] = CreateOperatorCode(flatBufferBuilder, tflite::BuiltinOperator_ADD); + codes[1] = CreateOperatorCode(flatBufferBuilder, tflite::BuiltinOperator_DIV); + + flatbuffers::Offset flatbufferModel = + CreateModel(flatBufferBuilder, + TFLITE_SCHEMA_VERSION, + flatBufferBuilder.CreateVector(codes.data(), codes.size()), + flatBufferBuilder.CreateVector(&subgraph, 1), + modelDescription, + flatBufferBuilder.CreateVector(buffers.data(), buffers.size())); + + flatBufferBuilder.Finish(flatbufferModel); + + return std::vector(flatBufferBuilder.GetBufferPointer(), + flatBufferBuilder.GetBufferPointer() + flatBufferBuilder.GetSize()); +} + +void ReduceFp32ToBf16TestImpl() +{ + using namespace tflite; + // Set input data + std::vector inputShape{ 1, 5, 5, 1 }; + std::vector filterShape{ 1, 3, 3, 1 }; + std::vector biasShape{ 1 }; + std::vector outputShape{ 1, 3, 3, 1 }; + + std::vector inputValues = + { + 1, 5, 2, 3, 5, + 8, 7, 3, 6, 3, + 3, 3, 9, 1, 9, + 4, 1, 8, 1, 3, + 6, 8, 1, 9, 2 + }; + + std::vector filterValues = + { + 4, 5, 6, + 0, 0, 0, + 3, 2, 1 + }; + + std::vector biasValues = { 5 }; + + std::vector expectedResult = + { + 28, 38, 29, + 96, 104, 53, + 31, 55, 24 + }; + + tflite::Padding padding = Padding_SAME; + + std::vector modelBuffer; + modelBuffer = CreateConv2dTfLiteModel(BuiltinOperator_CONV_2D, + ::tflite::TensorType_FLOAT32, + 2, + 2, + 1, + 1, + padding, + ActivationFunctionType_NONE, + inputShape, + filterShape, + biasShape, + outputShape, + filterValues, + biasValues); + + + const Model* tfLiteModel = GetModel(modelBuffer.data()); + // Create TfLite Interpreters + std::unique_ptr armnnDelegateInterpreter; + CHECK(InterpreterBuilder(tfLiteModel, ::tflite::ops::builtin::BuiltinOpResolver()) + (&armnnDelegateInterpreter) == kTfLiteOk); + CHECK(armnnDelegateInterpreter != nullptr); + CHECK(armnnDelegateInterpreter->AllocateTensors() == kTfLiteOk); + + // Create the Armnn Delegate + std::vector backends = {armnn::Compute::CpuRef}; + std::vector backendOptions; + + // Enable debug with BF16 enabled + armnn::OptimizerOptions optimizerOptions(false, true, true, false); + + armnnDelegate::DelegateOptions delegateOptions(backends, optimizerOptions); + std::unique_ptr + theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions), + armnnDelegate::TfLiteArmnnDelegateDelete); + CHECK(theArmnnDelegate != nullptr); + // Modify armnnDelegateInterpreter to use armnnDelegate + CHECK(armnnDelegateInterpreter->ModifyGraphWithDelegate(theArmnnDelegate.get()) == kTfLiteOk); + + // Set input data + armnnDelegate::FillInput(armnnDelegateInterpreter, 0, inputValues); + + // Run EnqueueWorkload + CHECK(armnnDelegateInterpreter->Invoke() == kTfLiteOk); + + // Compare output data + auto armnnDelegateOutputId = armnnDelegateInterpreter->outputs()[0]; + auto armnnDelegateOutputData = armnnDelegateInterpreter->typed_tensor(armnnDelegateOutputId); + armnnDelegate::CompareData(expectedResult.data(), armnnDelegateOutputData, expectedResult.size()); + armnnDelegateInterpreter.reset(nullptr); +} + +template +void DelegateOptionTest(tflite::TensorType tensorType, + const std::vector& backends, + std::vector& tensorShape, 
+ std::vector& input0Values, + std::vector& input1Values, + std::vector& input2Values, + std::vector& expectedOutputValues, + const armnnDelegate::DelegateOptions& delegateOptions, + float quantScale = 1.0f, + int quantOffset = 0) +{ + using namespace tflite; + std::vector modelBuffer = CreateAddDivTfLiteModel(tensorType, + tensorShape, + quantScale, + quantOffset); + + const Model* tfLiteModel = GetModel(modelBuffer.data()); + // Create TfLite Interpreters + std::unique_ptr armnnDelegateInterpreter; + CHECK(InterpreterBuilder(tfLiteModel, ::tflite::ops::builtin::BuiltinOpResolver()) + (&armnnDelegateInterpreter) == kTfLiteOk); + CHECK(armnnDelegateInterpreter != nullptr); + CHECK(armnnDelegateInterpreter->AllocateTensors() == kTfLiteOk); + + std::unique_ptr tfLiteInterpreter; + CHECK(InterpreterBuilder(tfLiteModel, ::tflite::ops::builtin::BuiltinOpResolver()) + (&tfLiteInterpreter) == kTfLiteOk); + CHECK(tfLiteInterpreter != nullptr); + CHECK(tfLiteInterpreter->AllocateTensors() == kTfLiteOk); + + // Create the ArmNN Delegate + std::unique_ptr + theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions), + armnnDelegate::TfLiteArmnnDelegateDelete); + CHECK(theArmnnDelegate != nullptr); + // Modify armnnDelegateInterpreter to use armnnDelegate + CHECK(armnnDelegateInterpreter->ModifyGraphWithDelegate(theArmnnDelegate.get()) == kTfLiteOk); + + // Set input data + armnnDelegate::FillInput(tfLiteInterpreter, 0, input0Values); + armnnDelegate::FillInput(tfLiteInterpreter, 1, input1Values); + armnnDelegate::FillInput(tfLiteInterpreter, 2, input2Values); + + armnnDelegate::FillInput(armnnDelegateInterpreter, 0, input0Values); + armnnDelegate::FillInput(armnnDelegateInterpreter, 1, input1Values); + armnnDelegate::FillInput(armnnDelegateInterpreter, 2, input2Values); + + // Run EnqueueWorkload + CHECK(tfLiteInterpreter->Invoke() == kTfLiteOk); + CHECK(armnnDelegateInterpreter->Invoke() == kTfLiteOk); + + armnnDelegate::CompareOutputData(tfLiteInterpreter, armnnDelegateInterpreter, tensorShape, expectedOutputValues); + + armnnDelegateInterpreter.reset(nullptr); +} + +} // anonymous namespace \ No newline at end of file diff --git a/delegate/src/test/TestUtils.cpp b/delegate/src/test/TestUtils.cpp index 2787147639..1bc5786112 100644 --- a/delegate/src/test/TestUtils.cpp +++ b/delegate/src/test/TestUtils.cpp @@ -71,17 +71,22 @@ void CompareData(Half tensor1[], Half tensor2[], size_t tensorSize) void CompareData(TfLiteFloat16 tensor1[], TfLiteFloat16 tensor2[], size_t tensorSize) { + uint16_t tolerance = 1; for (size_t i = 0; i < tensorSize; i++) { - CHECK(tensor1[i].data == tensor2[i].data); + uint16_t tensor1Data = tensor1[i].data; + uint16_t tensor2Data = tensor2[i].data; + CHECK(std::max(tensor1Data, tensor2Data) - std::min(tensor1Data, tensor2Data) <= tolerance); } } -void CompareData(TfLiteFloat16 tensor1[], Half tensor2[], size_t tensorSize) -{ +void CompareData(TfLiteFloat16 tensor1[], Half tensor2[], size_t tensorSize) { + uint16_t tolerance = 1; for (size_t i = 0; i < tensorSize; i++) { - CHECK(tensor1[i].data == half_float::detail::float2half(tensor2[i])); + uint16_t tensor1Data = tensor1[i].data; + uint16_t tensor2Data = half_float::detail::float2half(tensor2[i]); + CHECK(std::max(tensor1Data, tensor2Data) - std::min(tensor1Data, tensor2Data) <= tolerance); } } diff --git a/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp b/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp index ca42cacb39..c45ab2cded 100644 --- a/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp 
+++ b/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp
@@ -31,7 +31,8 @@ inline LayerT* ConvertWeight(Layer* l)
                     info.GetNumElements(),
                     newValues.data());
-            TensorInfo newInfo(info.GetShape(), DataType::BFloat16);
+            TensorInfo newInfo(info);
+            newInfo.SetDataType(DataType::BFloat16);
             ConstTensor newInput(newInfo, newValues);
             layer->m_Weight.reset(new ScopedCpuTensorHandle(newInput));
         }
--
cgit v1.2.1
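As a usage note for the armnn_delegate.cpp changes: the options are set once on the DelegateOptions, and ModifyGraphWithDelegate then carries them into armnn::Optimize, IRuntime::LoadNetwork and RegisterDebugCallback as shown in the diff. A condensed sketch follows, mirroring the ArmnnDelegateOptimizerOptionsImport test; the interpreter construction is elided and the positional argument meanings are assumptions taken from that test.

    // Illustrative only: request buffer import/export so suitably aligned input/output
    // buffers can be used without copies.
    std::vector<armnn::BackendId> backends = { armnn::Compute::CpuAcc, armnn::Compute::CpuRef };
    armnn::OptimizerOptions optimizerOptions(false, false, false, /*importEnabled=*/true);
    armnn::INetworkProperties networkProperties(/*importEnabled=*/true, /*exportEnabled=*/true);
    armnnDelegate::DelegateOptions delegateOptions(backends, optimizerOptions, networkProperties);

    std::unique_ptr<TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
        theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions),
                         armnnDelegate::TfLiteArmnnDelegateDelete);

    // 'interpreter' is a previously built tflite::Interpreter (model setup not shown).
    // With the DelegateUtils.hpp change above, unsupported operators are now logged as
    // warnings rather than errors, since execution can still fall back instead of failing.
    if (interpreter->ModifyGraphWithDelegate(theArmnnDelegate.get()) != kTfLiteOk)
    {
        // Delegation failed; the interpreter still runs entirely on the TfLite kernels.
    }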
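On the ConvertFp32NetworkToBf16.hpp hunk itself: building the new weight TensorInfo as a copy of the original and then retargeting only the data type preserves every other field of the original info (quantization parameters and so on), whereas the previous shape-plus-datatype constructor reset those fields to their defaults. A small sketch of the difference in copy semantics; the concrete shape and scale are made up for illustration.

    armnn::TensorInfo info({ 1, 3, 3, 1 }, armnn::DataType::Float32);
    info.SetQuantizationScale(0.5f);   // stands in for any non-default metadata on the weight info

    // Old construction: only the shape survives, other fields revert to their defaults.
    armnn::TensorInfo oldInfo(info.GetShape(), armnn::DataType::BFloat16);

    // New construction: copy the whole info, then switch the data type to BFloat16.
    armnn::TensorInfo newInfo(info);
    newInfo.SetDataType(armnn::DataType::BFloat16);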