From 81ec994a3ebc8ad02c4a622846cf64b70e1182bd Mon Sep 17 00:00:00 2001
From: Matthew Sloyan <matthew.sloyan@arm.com>
Date: Tue, 12 Oct 2021 10:26:30 +0100
Subject: IVGCVSW-6166 Add Support for Conv3d to TFLite Delegate

 * Conv3d is only correctly supported for external delegates
   from TF v2.6, as there was a breaking bug in v2.5.

Signed-off-by: Matthew Sloyan <matthew.sloyan@arm.com>
Change-Id: Ib7941307f4c7b0d3dbb7deaa5a90aceb63c1162f
---
 delegate/CMakeLists.txt                     |   1 +
 delegate/include/armnn_delegate.hpp         |   9 +
 delegate/src/Convolution.hpp                | 167 +++++++++++++++++
 delegate/src/DelegateUtils.hpp              |   5 +-
 delegate/src/armnn_delegate.cpp             |   9 +
 delegate/src/test/Convolution3dTest.cpp     | 273 ++++++++++++++++++++++++++++
 delegate/src/test/ConvolutionTestHelper.hpp | 258 ++++++++++++++++++++++++++
 docs/01_03_delegate.dox                     |   2 +
 8 files changed, 722 insertions(+), 2 deletions(-)
 create mode 100644 delegate/src/test/Convolution3dTest.cpp

diff --git a/delegate/CMakeLists.txt b/delegate/CMakeLists.txt
index 504256da80..0178594bfe 100644
--- a/delegate/CMakeLists.txt
+++ b/delegate/CMakeLists.txt
@@ -141,6 +141,7 @@ if(BUILD_UNIT_TESTS)
         src/test/ControlTest.cpp
         src/test/ControlTestHelper.hpp
         src/test/Convolution2dTest.cpp
+        src/test/Convolution3dTest.cpp
         src/test/ConvolutionTestHelper.hpp
         src/test/DelegateOptionsTest.cpp
         src/test/DelegateOptionsTestHelper.hpp
diff --git a/delegate/include/armnn_delegate.hpp b/delegate/include/armnn_delegate.hpp
index b213211ae9..8aaf255a9d 100644
--- a/delegate/include/armnn_delegate.hpp
+++ b/delegate/include/armnn_delegate.hpp
@@ -11,6 +11,15 @@
 #include <tensorflow/lite/c/builtin_op_data.h>
 #include <tensorflow/lite/c/common.h>
 #include <tensorflow/lite/minimal_logging.h>
+#include <tensorflow/lite/version.h>
+
+#if TF_MAJOR_VERSION > 2 || (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 3)
+#define ARMNN_POST_TFLITE_2_3
+#endif
+
+#if TF_MAJOR_VERSION > 2 || (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 5)
+#define ARMNN_POST_TFLITE_2_5
+#endif
 
 namespace armnnDelegate
 {
diff --git a/delegate/src/Convolution.hpp b/delegate/src/Convolution.hpp
index 96612e0214..a7d6c1de26 100644
--- a/delegate/src/Convolution.hpp
+++ b/delegate/src/Convolution.hpp
@@ -204,6 +204,168 @@ TfLiteStatus VisitConv2dOperator(DelegateData& delegateData,
 
 }
 
+// Conv3d is only correctly supported for external delegates from TF Lite v2.6, as there was a breaking bug in v2.5.
+#if defined(ARMNN_POST_TFLITE_2_5)
+// Handles a TfLite CONV_3D node: validates it and, if delegateData.m_Network is set, adds a Convolution3d layer.
+// If m_Network is null, only backend support is queried: kTfLiteOk when supported, kTfLiteError otherwise.
+// Returns kTfLiteError for a malformed node (e.g. fewer than 2 inputs, missing builtin data, invalid tensor);
+// otherwise returns the status reported by FusedActivation.
+TfLiteStatus VisitConv3dOperator(DelegateData& delegateData,
+                                 TfLiteContext* tfLiteContext,
+                                 TfLiteNode* tfLiteNode,
+                                 int nodeIndex,
+                                 int32_t operatorCode)
+{
+    auto numInputs = tfLiteNode->inputs->size;
+    if (numInputs < 2)
+    {
+        TF_LITE_MAYBE_KERNEL_LOG(
+                tfLiteContext, "TfLiteArmnnDelegate: Minimum number of inputs (%d != %d) in node #%d",
+                2, numInputs, nodeIndex);
+        return kTfLiteError;
+    }
+    TF_LITE_ENSURE_STATUS(ValidateNumOutputs(tfLiteContext, tfLiteNode, 1, nodeIndex));
+
+    armnn::Convolution3dDescriptor descriptor;
+    const auto params = reinterpret_cast<TfLiteConv3DParams*>(tfLiteNode->builtin_data);
+    // params is dereferenced below, so a node without builtin data has to be rejected before going any further.
+    if (!params)
+    {
+        TF_LITE_MAYBE_KERNEL_LOG(
+                tfLiteContext, "TfLiteArmnnDelegate: Missing Conv3d parameters in node #%d", nodeIndex);
+        return kTfLiteError;
+    }
+
+    const bool biasEnabled = (numInputs == 3);
+    descriptor.m_BiasEnabled = biasEnabled;
+    descriptor.m_DataLayout = armnn::DataLayout::NDHWC;
+    descriptor.m_StrideX = NonNegative(params->stride_width, nodeIndex);
+    descriptor.m_StrideY = NonNegative(params->stride_height, nodeIndex);
+    descriptor.m_StrideZ = NonNegative(params->stride_depth, nodeIndex);
+    descriptor.m_DilationX = NonNegative(params->dilation_width_factor, nodeIndex);
+    descriptor.m_DilationY = NonNegative(params->dilation_height_factor, nodeIndex);
+    descriptor.m_DilationZ = NonNegative(params->dilation_depth_factor, nodeIndex);
+
+    const TfLiteTensor* tfLiteTensors = tfLiteContext->tensors;
+    const TfLiteTensor& tfLiteInputTensor = tfLiteTensors[tfLiteNode->inputs->data[0]];
+    if (!IsValid(tfLiteContext, tfLiteInputTensor, operatorCode, nodeIndex))
+    {
+        return kTfLiteError;
+    }
+
+    const TfLiteTensor& tfLiteOutputTensor = tfLiteTensors[tfLiteNode->outputs->data[0]];
+    if (!IsValid(tfLiteContext, tfLiteOutputTensor, operatorCode, nodeIndex))
+    {
+        return kTfLiteError;
+    }
+
+    const TfLiteTensor& tfLiteFilterTensor = tfLiteTensors[tfLiteNode->inputs->data[1]];
+    if (!IsValid(tfLiteContext, tfLiteFilterTensor, operatorCode, nodeIndex))
+    {
+        return kTfLiteError;
+    }
+
+    const armnn::TensorInfo& inputTensorInfo  = GetTensorInfoForTfLiteTensor(tfLiteInputTensor);
+    const armnn::TensorInfo& outputTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteOutputTensor);
+    armnn::TensorInfo filterTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteFilterTensor);
+
+    armnn::TensorInfo biasTensorInfo;
+    if(biasEnabled)
+    {
+        const TfLiteTensor& tfLiteBiasTensor = tfLiteTensors[tfLiteNode->inputs->data[2]];
+        if (!IsValid(tfLiteContext, tfLiteBiasTensor, operatorCode, nodeIndex))
+        {
+            return kTfLiteError;
+        }
+        biasTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteBiasTensor);
+    }
+    else
+    {
+        biasTensorInfo = armnn::TensorInfo(armnn::TensorShape({1}), GetDataType(tfLiteInputTensor));
+    }
+
+    armnn::Optional<armnn::TensorInfo> optionalBiasInfo(biasTensorInfo);
+
+    // TfLite uses NDHWC tensors
+    const unsigned int inputDepth  = inputTensorInfo.GetShape()[1];
+    const unsigned int inputHeight = inputTensorInfo.GetShape()[2];
+    const unsigned int inputWidth  = inputTensorInfo.GetShape()[3];
+
+    // Assuming the filter is DHWIO : Depth, Height, Width, InputChannels, OutputChannels
+    const unsigned int filterDepth  = filterTensorInfo.GetShape()[0];
+    const unsigned int filterHeight = filterTensorInfo.GetShape()[1];
+    const unsigned int filterWidth  = filterTensorInfo.GetShape()[2];
+
+    // Calculate padding
+    CalcPadding(inputDepth, filterDepth, descriptor.m_StrideZ, descriptor.m_DilationZ,
+                descriptor.m_PadFront, descriptor.m_PadBack, params->padding);
+    CalcPadding(inputHeight, filterHeight, descriptor.m_StrideY, descriptor.m_DilationY,
+                descriptor.m_PadTop, descriptor.m_PadBottom, params->padding);
+    CalcPadding(inputWidth, filterWidth, descriptor.m_StrideX, descriptor.m_DilationX,
+                descriptor.m_PadLeft, descriptor.m_PadRight, params->padding);
+
+    // A null m_Network signals the prerequisite TfLite callback that clarifies support for the operator: only
+    // report support here. If supported, VisitConvolutionOperator is called again to add the layer as seen below.
+    if (!delegateData.m_Network)
+    {
+        bool isSupported = false;
+        FORWARD_LAYER_SUPPORT_FUNC(__func__,
+                                   tfLiteContext,
+                                   IsConvolution3dSupported,
+                                   delegateData.m_Backends,
+                                   isSupported,
+                                   inputTensorInfo,
+                                   outputTensorInfo,
+                                   descriptor,
+                                   filterTensorInfo,
+                                   optionalBiasInfo);
+        return isSupported ? kTfLiteOk : kTfLiteError;
+    }
+
+    armnn::IConnectableLayer* layer =  delegateData.m_Network->AddConvolution3dLayer(descriptor);
+    ARMNN_ASSERT(layer != nullptr);
+
+    // Constant weights and biases get their own constant layers, connected to Convolution3d input slots 1 and 2.
+    if (tflite::IsConstantTensor(&tfLiteFilterTensor))
+    {
+        auto filter = CreateConstTensor(&tfLiteFilterTensor,
+                                        filterTensorInfo,
+                                        armnn::Optional<armnn::PermutationVector&>());
+
+        armnn::IConnectableLayer* weightsLayer = delegateData.m_Network->AddConstantLayer(filter);
+        ARMNN_ASSERT(weightsLayer != nullptr);
+
+        weightsLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1u));
+        weightsLayer->GetOutputSlot(0).SetTensorInfo(filterTensorInfo);
+    }
+
+    if(biasEnabled)
+    {
+        const TfLiteTensor& tfLiteBiasTensor = tfLiteTensors[tfLiteNode->inputs->data[2]];
+        if(tflite::IsConstantTensor(&tfLiteBiasTensor))
+        {
+            auto biases = CreateConstTensor(&tfLiteBiasTensor,
+                                            biasTensorInfo,
+                                            armnn::Optional<armnn::PermutationVector&>());
+
+            armnn::IConnectableLayer* biasLayer = delegateData.m_Network->AddConstantLayer(biases);
+            ARMNN_ASSERT(biasLayer != nullptr);
+
+            biasLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(2u));
+            biasLayer->GetOutputSlot(0).SetTensorInfo(biasTensorInfo);
+        }
+    }
+
+    armnn::IOutputSlot& outputSlot = layer->GetOutputSlot(0);
+    outputSlot.SetTensorInfo(outputTensorInfo);
+
+    Connect(layer, tfLiteNode, delegateData);
+
+    // params was validated above, so the fused activation type can be read from it directly.
+    return FusedActivation(tfLiteContext, tfLiteNode, params->activation, layer, 0, delegateData);
+}
+#endif
+
 TfLiteStatus VisitDepthwiseConv2dOperator(DelegateData& delegateData,
                                           TfLiteContext* tfLiteContext,
                                           TfLiteNode* tfLiteNode,
@@ -581,6 +743,11 @@ TfLiteStatus VisitConvolutionOperator(DelegateData& delegateData,
     {
         case kTfLiteBuiltinConv2d:
             return VisitConv2dOperator(delegateData, tfLiteContext, tfLiteNode, nodeIndex, operatorCode);
+// Conv3d is only correctly supported for external delegates from TF Lite v2.6, as there was a breaking bug in v2.5.
+#if defined(ARMNN_POST_TFLITE_2_5)
+        case kTfLiteBuiltinConv3d:
+            return VisitConv3dOperator(delegateData, tfLiteContext, tfLiteNode, nodeIndex, operatorCode);
+#endif
         case kTfLiteBuiltinDepthwiseConv2d:
             return VisitDepthwiseConv2dOperator(delegateData, tfLiteContext, tfLiteNode, nodeIndex, operatorCode);
         case kTfLiteBuiltinTransposeConv:
diff --git a/delegate/src/DelegateUtils.hpp b/delegate/src/DelegateUtils.hpp
index e408dba138..45174458aa 100644
--- a/delegate/src/DelegateUtils.hpp
+++ b/delegate/src/DelegateUtils.hpp
@@ -5,6 +5,8 @@
 
 #pragma once
 
+#include <armnn_delegate.hpp>
+
 #include <armnn/ArmNN.hpp>
 #include <armnn/BackendHelper.hpp>
 #include <armnn/utility/Assert.hpp>
@@ -16,7 +18,6 @@
 #include <tensorflow/lite/c/builtin_op_data.h>
 #include <tensorflow/lite/c/common.h>
 #include <tensorflow/lite/minimal_logging.h>
-#include <tensorflow/lite/version.h>
 
 #include "tensorflow/lite/kernels/kernel_util.h"
 
@@ -296,7 +297,7 @@ TfLiteStatus FusedActivation(TfLiteContext* tfLiteContext,
             break;
         }
 // The name of kTfLiteActRelu1 changed after TF Lite v2.3
-#if TF_MAJOR_VERSION > 2 || (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION > 3)
+#if defined(ARMNN_POST_TFLITE_2_3)
         case kTfLiteActReluN1To1:
 #else
         case kTfLiteActRelu1:
diff --git a/delegate/src/armnn_delegate.cpp b/delegate/src/armnn_delegate.cpp
index 2ede23c12b..5fbc920a1e 100644
--- a/delegate/src/armnn_delegate.cpp
+++ b/delegate/src/armnn_delegate.cpp
@@ -522,6 +522,15 @@ TfLiteStatus ArmnnSubgraph::VisitNode(DelegateData& delegateData,
                                             tfLiteNode,
                                             nodeIndex,
                                             kTfLiteBuiltinConv2d);
+// Conv3d is only correctly supported for external delegates from TF Lite v2.6, as there was a breaking bug in v2.5.
+#if defined(ARMNN_POST_TFLITE_2_5)
+        case kTfLiteBuiltinConv3d:
+            return VisitConvolutionOperator(delegateData,
+                                            tfLiteContext,
+                                            tfLiteNode,
+                                            nodeIndex,
+                                            kTfLiteBuiltinConv3d);
+#endif
         case kTfLiteBuiltinDepthToSpace:
             return VisitDepthToSpaceOperator(delegateData,
                                              tfLiteContext,
diff --git a/delegate/src/test/Convolution3dTest.cpp b/delegate/src/test/Convolution3dTest.cpp
new file mode 100644
index 0000000000..6caa7ea18f
--- /dev/null
+++ b/delegate/src/test/Convolution3dTest.cpp
@@ -0,0 +1,273 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConvolutionTestHelper.hpp"
+
+#include <armnn_delegate.hpp>
+
+#include <flatbuffers/flatbuffers.h>
+#include <tensorflow/lite/interpreter.h>
+#include <tensorflow/lite/kernels/register.h>
+#include <tensorflow/lite/model.h>
+#include <tensorflow/lite/schema/schema_generated.h>
+
+#include <doctest/doctest.h>
+
+namespace armnnDelegate
+{
+
+// Conv3d currently only supports Float32 inputs, filter, bias and outputs in TFLite.
+// Conv3d is only correctly supported for external delegates from TF Lite v2.6, as there was a breaking bug in v2.5.
+#if defined(ARMNN_POST_TFLITE_2_5)
+
+// Returns { 0/divisor, 1/divisor, ..., (size-1)/divisor } converted to T; the result is empty when size <= 0.
+// The division is carried out in float and then converted, so T can be any type constructible from float.
+template <typename T>
+std::vector<T> CreateFloatData(int32_t size, float divisor)
+{
+    std::vector<T> data;
+    for (int32_t i = 0; i < size; ++i)
+    {
+        data.push_back(static_cast<T>(static_cast<float>(i) / divisor));
+    }
+    return data;
+}
+
+// Conv3d with bias on a 2x2x2 single-channel input; with SAME padding the output shape equals the input shape.
+void Conv3DWithBiasesSimpleWithPaddingFp32Test(std::vector<armnn::BackendId>& backends)
+{
+    std::vector<int32_t> inputShape { 1, 2, 2, 2, 1 };
+    std::vector<int32_t> filterShape { 2, 2, 2, 1, 1 };
+    std::vector<int32_t> biasShape { 1 };
+    std::vector<int32_t> outputShape { 1, 2, 2, 2, 1 };
+
+    std::vector<float> inputValues =
+    {
+        1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f
+    };
+
+    std::vector<float> filterValues =
+    {
+        2.f,1.f, 1.f,0.f, 0.f,1.f, 1.f,1.f
+    };
+
+    std::vector<float> biasValues = { 5.f };
+
+    std::vector<float> expectedOutputValues =
+    {
+       33.f, 21.f, 23.f, 13.f, 28.f, 25.f, 27.f, 21.f
+    };
+
+    Convolution3dTest<float>(tflite::BuiltinOperator_CONV_3D,
+                             ::tflite::TensorType_FLOAT32,
+                             { 1, 1, 1 }, // strideX, strideY, strideZ
+                             { 1, 1, 1 }, // dilationX, dilationY, dilationZ
+                             tflite::Padding_SAME,
+                             tflite::ActivationFunctionType_NONE,
+                             backends,
+                             inputShape,
+                             filterShape,
+                             outputShape,
+                             inputValues,
+                             filterValues,
+                             expectedOutputValues,
+                             biasShape,
+                             biasValues);
+}
+
+// Conv3d with bias, a stride of 2 in every spatial dimension and VALID padding on a 3x10x10 single-channel input.
+void Conv3DWithBiasesStridesFp32Test(std::vector<armnn::BackendId>& backends)
+{
+    std::vector<int32_t> inputShape { 1, 3, 10, 10, 1 };  // N, D, H, W, C
+    std::vector<int32_t> filterShape { 3, 5, 5, 1, 1 };   // D, H, W, I, O
+    std::vector<int32_t> biasShape { 1 };
+    std::vector<int32_t> outputShape { 1, 1, 3, 3, 1 };   // N, D, H, W, C
+    std::vector<float> inputValues = CreateFloatData<float>(300, 1.0f);
+
+    std::vector<float> filterValues =
+    {
+        1.f, 1.f, 1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f, 1.f, 1.f,
+
+        0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f,
+
+        2.f, 2.f, 2.f, 2.f, 2.f,
+        2.f, 2.f, 2.f, 2.f, 2.f,
+        2.f, 2.f, 2.f, 2.f, 2.f,
+        2.f, 2.f, 2.f, 2.f, 2.f,
+        2.f, 2.f, 2.f, 2.f, 2.f
+    };
+
+    std::vector<float> biasValues = { 10.f };
+
+    std::vector<float> expectedOutputValues =
+    {
+        11660.f, 11810.f, 11960.f,
+
+        13160.f, 13310.f, 13460.f,
+
+        14660.f, 14810.f, 14960.f
+    };
+
+    Convolution3dTest<float>(tflite::BuiltinOperator_CONV_3D,
+                             ::tflite::TensorType_FLOAT32,
+                             { 2, 2, 2 }, // strideX, strideY, strideZ
+                             { 1, 1, 1 }, // dilationX, dilationY, dilationZ
+                             tflite::Padding_VALID,
+                             tflite::ActivationFunctionType_NONE,
+                             backends,
+                             inputShape,
+                             filterShape,
+                             outputShape,
+                             inputValues,
+                             filterValues,
+                             expectedOutputValues,
+                             biasShape,
+                             biasValues);
+}
+
+
+// Conv3d with bias, a dilation rate of 3 in every spatial dimension and VALID padding; 2 input and 2 output channels.
+void Conv3DWithBiasesDilationFp32Test(std::vector<armnn::BackendId>& backends)
+{
+    std::vector<int32_t> inputShape { 1, 5, 5, 5, 2 };   // N, D, H, W, C
+    std::vector<int32_t> filterShape { 2, 2, 2, 2, 2 };  // D, H, W, I, O
+    std::vector<int32_t> biasShape { 2 };
+    std::vector<int32_t> outputShape { 1, 2, 2, 2, 2 };  // N, D, H, W, C
+    std::vector<float> inputValues = CreateFloatData<float>(250, 1.0f);
+
+    std::vector<float> filterValues =
+    {
+        -1.f, -1.f,  -1.f, -1.f,  -1.f, -1.f,  -1.f, -1.f,  -1.f, -1.f,  -1.f,  1.f,   1.f,  1.f,  -1.f, -1.f,
+         1.f,  1.f,  -1.f,  1.f,  -1.f,  1.f,  -1.f,  1.f,  -1.f, -1.f,  -1.f,  1.f,  -1.f,  1.f,  -1.f,  1.f,
+    };
+
+    std::vector<float> biasValues = { 0.f, 2.f };
+
+    // Since the dilation rate is 3, the 2x2x2 kernel is dilated to cover 4x4x4 (effective size (2 - 1) * 3 + 1 = 4),
+    // therefore with VALID padding on the 5x5x5 input the output will be 2x2x2 (5 - 4 + 1 = 2 per dimension).
+    std::vector<float> expectedOutputValues =
+    {
+        -1124.f, 976.f,
+        -1148.f, 980.f,
+
+        -1244.f, 996.f,
+        -1268.f, 1000.f,
+
+        -1724.f, 1076.f,
+        -1748.f, 1080.f,
+
+        -1844.f, 1096.f,
+        -1868.f, 1100.f
+    };
+
+    Convolution3dTest<float>(tflite::BuiltinOperator_CONV_3D,
+                             ::tflite::TensorType_FLOAT32,
+                             { 1, 1, 1 }, // strideX, strideY, strideZ
+                             { 3, 3, 3 }, // dilationX, dilationY, dilationZ
+                             tflite::Padding_VALID,
+                             tflite::ActivationFunctionType_NONE,
+                             backends,
+                             inputShape,
+                             filterShape,
+                             outputShape,
+                             inputValues,
+                             filterValues,
+                             expectedOutputValues,
+                             biasShape,
+                             biasValues);
+}
+
+// Conv3d with fractional input and filter values: 3x3x3 filter, stride of 2, VALID padding and a zero bias.
+void Conv3DFp32SmallTest(std::vector<armnn::BackendId>& backends)
+{
+    std::vector<int32_t> inputShape { 1, 3, 10, 10, 1 };  // N, D, H, W, C
+    std::vector<int32_t> filterShape { 3, 3, 3, 1, 1 };   // D, H, W, I, O
+    std::vector<int32_t> biasShape { 1 };
+    std::vector<int32_t> outputShape { 1, 1, 4, 4, 1 };   // N, D, H, W, C
+    std::vector<float> inputValues = CreateFloatData<float>(300, 100.0f);
+
+    std::vector<float> filterValues =
+    {
+         0.125977f,  0.150391f,  0.101562f,
+         0.0585938f, 0.0864258f, 0.043457f,
+         0.034668f,  0.0322266f, 0.0385742f,
+
+         0.125977f,  0.150391f, -0.101562f,
+        -0.0585938f,-0.0864258f,-0.043457f,
+        -0.0104630f, 0.0154114f, 0.0013768f,
+
+         0.0344238f, 0.035644f,  0.0495605f,
+         0.0683594f, 0.099121f, -0.0461426f,
+        -0.0996094f,-0.126953f, -0.043457f,
+    };
+
+    std::vector<float> biasValues = { 0 };
+
+    std::vector<float> expectedOutputValues =
+    {
+        -0.08156067f, -0.06891209f, -0.05589598f, -0.04310101f,
+         0.04584253f,  0.05855697f,  0.07129729f,  0.08325434f,
+         0.17304349f,  0.18521416f,  0.19818866f,  0.21096253f,
+         0.29965734f,  0.312698f,    0.32547557f,  0.33818722f
+    };
+
+    Convolution3dTest<float>(tflite::BuiltinOperator_CONV_3D,
+                             ::tflite::TensorType_FLOAT32,
+                             { 2, 2, 2 }, // strideX, strideY, strideZ
+                             { 1, 1, 1 }, // dilationX, dilationY, dilationZ
+                             tflite::Padding_VALID,
+                             tflite::ActivationFunctionType_NONE,
+                             backends,
+                             inputShape,
+                             filterShape,
+                             outputShape,
+                             inputValues,
+                             filterValues,
+                             expectedOutputValues,
+                             biasShape,
+                             biasValues);
+}
+
+TEST_SUITE("Convolution3dTest_CpuRefTests")
+{
+// Each case runs one of the Conv3d test functions above with CpuRef as the only backend.
+TEST_CASE("Conv3DWithBiasesSimpleWithPadding_Fp32_CpuRef_Test")
+{
+    std::vector<armnn::BackendId> cpuRef { armnn::Compute::CpuRef };
+    Conv3DWithBiasesSimpleWithPaddingFp32Test(cpuRef);
+}
+
+TEST_CASE("Conv3DWithBiasesStrides_Fp32_CpuRef_Test")
+{
+    std::vector<armnn::BackendId> cpuRef { armnn::Compute::CpuRef };
+    Conv3DWithBiasesStridesFp32Test(cpuRef);
+}
+
+TEST_CASE("Conv3DWithBiasesDilation_Fp32_CpuRef_Test")
+{
+    std::vector<armnn::BackendId> cpuRef { armnn::Compute::CpuRef };
+    Conv3DWithBiasesDilationFp32Test(cpuRef);
+}
+
+TEST_CASE("Conv3DFp32Small_Fp32_CpuRef_Test")
+{
+    std::vector<armnn::BackendId> cpuRef { armnn::Compute::CpuRef };
+    Conv3DFp32SmallTest(cpuRef);
+}
+
+
+} // End of TEST_SUITE("Convolution3dTest_CpuRefTests")
+
+#endif
+
+} // namespace armnnDelegate
\ No newline at end of file
diff --git a/delegate/src/test/ConvolutionTestHelper.hpp b/delegate/src/test/ConvolutionTestHelper.hpp
index 1b33c1d74d..ce1f951d21 100644
--- a/delegate/src/test/ConvolutionTestHelper.hpp
+++ b/delegate/src/test/ConvolutionTestHelper.hpp
@@ -5,6 +5,8 @@
 
 #pragma once
 
+#include "TestUtils.hpp"
+
 #include <armnn_delegate.hpp>
 
 #include <flatbuffers/flatbuffers.h>
@@ -221,6 +223,7 @@ void ConvolutionTest(tflite::BuiltinOperator convolutionOperatorCode,
     using namespace tflite;
 
     std::vector<char> modelBuffer;
+
     modelBuffer = CreateConv2dTfLiteModel(convolutionOperatorCode,
                                           tensorType,
                                           strideX,
@@ -301,6 +304,261 @@ void ConvolutionTest(tflite::BuiltinOperator convolutionOperatorCode,
     }
 }
 
+// Conv3d is only correctly supported for external delegates from TF Lite v2.6, as there was a breaking bug in v2.5.
+#if defined(ARMNN_POST_TFLITE_2_5)
+// Builds an in-memory TfLite flatbuffer model containing a single CONV_3D operator whose inputs are the input,
+// filter and bias tensors (filter and bias contents are copied from filterData / biasData) and returns its bytes.
+// NOTE(review): convolutionOperatorCode and depth_multiplier are never read here; CONV_3D is hard-coded below.
+template <typename T, typename B = float>
+std::vector<char> CreateConv3dTfLiteModel(tflite::BuiltinOperator convolutionOperatorCode,
+                                          tflite::TensorType tensorType,
+                                          std::vector<uint32_t> strides,
+                                          std::vector<uint32_t> dilation,
+                                          tflite::Padding padding,
+                                          tflite::ActivationFunctionType fused_activation_function,
+                                          const std::vector<int32_t>& inputTensorShape,
+                                          const std::vector<int32_t>& filterTensorShape,
+                                          const std::vector<int32_t>& biasTensorShape,
+                                          const std::vector<int32_t>& outputTensorShape,
+                                          const std::vector<T>& filterData,
+                                          const std::vector<B>& biasData,
+                                          const std::vector<float> biasScales = {1.0f},
+                                          const std::vector<int64_t> biasOffsets = {0},
+                                          const std::vector<float> filterScales = {1.0f},
+                                          const std::vector<int64_t> filterOffsets = {0},
+                                          float outputQuantScale = 2.0f,
+                                          int outputQuantOffset = 0,
+                                          float quantScale = 1.0f,
+                                          int quantOffset = 0,
+                                          int32_t depth_multiplier = 1,
+                                          int32_t filterQuantizationDim = 0)
+{
+    using namespace tflite;
+    flatbuffers::FlatBufferBuilder flatBufferBuilder;
+
+    // buffers[0] is empty, buffers[1] holds the raw filter bytes and buffers[2] the raw bias bytes.
+    std::array<flatbuffers::Offset<tflite::Buffer>, 3> buffers;
+    buffers[0] = CreateBuffer(flatBufferBuilder, flatBufferBuilder.CreateVector({}));
+    buffers[1] = CreateBuffer(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector(reinterpret_cast<const uint8_t*>(filterData.data()),
+                                                             sizeof(T) * filterData.size()));
+    buffers[2] = CreateBuffer(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector(reinterpret_cast<const uint8_t*>(biasData.data()),
+                                                             sizeof(B) * biasData.size()));
+
+    auto quantizationParameters =
+            CreateQuantizationParameters(flatBufferBuilder,
+                                         0,
+                                         0,
+                                         flatBufferBuilder.CreateVector<float>({ quantScale }),
+                                         flatBufferBuilder.CreateVector<int64_t>({ quantOffset }));
+    auto outputQuantizationParameters =
+            CreateQuantizationParameters(flatBufferBuilder,
+                                         0,
+                                         0,
+                                         flatBufferBuilder.CreateVector<float>({ outputQuantScale }),
+                                         flatBufferBuilder.CreateVector<int64_t>({ outputQuantOffset }));
+    auto filterQuantizationParameters =
+            CreateQuantizationParameters(flatBufferBuilder,
+                                         0,
+                                         0,
+                                         flatBufferBuilder.CreateVector<float>(filterScales),
+                                         flatBufferBuilder.CreateVector<int64_t>(filterOffsets),
+                                         tflite::QuantizationDetails_NONE,
+                                         0,
+                                         filterQuantizationDim);
+    auto biasQuantizationParameters =
+            CreateQuantizationParameters(flatBufferBuilder,
+                                         0,
+                                         0,
+                                         flatBufferBuilder.CreateVector<float>(biasScales),
+                                         flatBufferBuilder.CreateVector<int64_t>(biasOffsets));
+
+    std::array<flatbuffers::Offset<Tensor>, 4> tensors;
+    tensors[0] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(inputTensorShape.data(),
+                                                                      inputTensorShape.size()),
+                              tensorType,
+                              0,
+                              flatBufferBuilder.CreateString("input"),
+                              quantizationParameters);
+    tensors[1] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(filterTensorShape.data(),
+                                                                      filterTensorShape.size()),
+                              tensorType,
+                              1,
+                              flatBufferBuilder.CreateString("filter"),
+                              filterQuantizationParameters);
+
+    // The bias is Float32 unless the model tensors are INT8/UINT8, in which case it is INT32.
+    auto biasTensorType = ::tflite::TensorType_FLOAT32;
+    if (tensorType == ::tflite::TensorType_INT8 || tensorType == ::tflite::TensorType_UINT8)
+    {
+        biasTensorType = ::tflite::TensorType_INT32;
+    }
+    tensors[2] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(biasTensorShape.data(), biasTensorShape.size()),
+                              biasTensorType,
+                              2,
+                              flatBufferBuilder.CreateString("bias"),
+                              biasQuantizationParameters);
+    tensors[3] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(outputTensorShape.data(),
+                                                                      outputTensorShape.size()),
+                              tensorType,
+                              0,
+                              flatBufferBuilder.CreateString("output"),
+                              outputQuantizationParameters);
+
+    tflite::BuiltinOptions operatorBuiltinOptionsType = tflite::BuiltinOptions_Conv3DOptions;
+    flatbuffers::Offset<void> operatorBuiltinOptions = CreateConv3DOptions(flatBufferBuilder,
+                                                                           padding,
+                                                                           strides[2], // Depth
+                                                                           strides[0], // Width
+                                                                           strides[1], // Height
+                                                                           fused_activation_function,
+                                                                           dilation[2], // Depth
+                                                                           dilation[0], // Width
+                                                                           dilation[1]).Union(); // Height
+
+    // Create operator
+    const std::vector<int> operatorInputs{0, 1, 2};
+    const std::vector<int> operatorOutputs{3};
+    flatbuffers::Offset <Operator> convolutionOperator =
+            CreateOperator(flatBufferBuilder,
+                           0,
+                           flatBufferBuilder.CreateVector<int32_t>(operatorInputs.data(), operatorInputs.size()),
+                           flatBufferBuilder.CreateVector<int32_t>(operatorOutputs.data(), operatorOutputs.size()),
+                           operatorBuiltinOptionsType,
+                           operatorBuiltinOptions);
+
+    const std::vector<int> subgraphInputs{0, 1, 2};
+    const std::vector<int> subgraphOutputs{3};
+    flatbuffers::Offset <SubGraph> subgraph =
+            CreateSubGraph(flatBufferBuilder,
+                           flatBufferBuilder.CreateVector(tensors.data(), tensors.size()),
+                           flatBufferBuilder.CreateVector<int32_t>(subgraphInputs.data(), subgraphInputs.size()),
+                           flatBufferBuilder.CreateVector<int32_t>(subgraphOutputs.data(), subgraphOutputs.size()),
+                           flatBufferBuilder.CreateVector(&convolutionOperator, 1));
+
+    flatbuffers::Offset <flatbuffers::String> modelDescription =
+            flatBufferBuilder.CreateString("ArmnnDelegate: Convolution 3d Operator Model");
+
+    // If using an operator with a code greater than 127 then the enum value should be passed as the fifth
+    // parameter rather than the second like in other tests.
+    flatbuffers::Offset <OperatorCode> operatorCode =
+            CreateOperatorCode(flatBufferBuilder, 0, 0, 1, tflite::BuiltinOperator_CONV_3D);
+
+    flatbuffers::Offset <Model> flatbufferModel =
+            CreateModel(flatBufferBuilder,
+                        TFLITE_SCHEMA_VERSION,
+                        flatBufferBuilder.CreateVector(&operatorCode, 1),
+                        flatBufferBuilder.CreateVector(&subgraph, 1),
+                        modelDescription,
+                        flatBufferBuilder.CreateVector(buffers.data(), buffers.size()));
+
+    flatBufferBuilder.Finish(flatbufferModel);
+
+    return std::vector<char>(flatBufferBuilder.GetBufferPointer(),
+                             flatBufferBuilder.GetBufferPointer() + flatBufferBuilder.GetSize());
+}
+
+/// Runs a Conv3d model on a plain TfLite interpreter and on the Arm NN delegate and compares the results.
+///
+/// The model is built with CreateConv3dTfLiteModel. Two interpreters are created from it; one of them is
+/// modified to use the Arm NN delegate. Both receive inputValues, are invoked, and their first output is
+/// compared with expectedOutputValues and with the other interpreter's output.
+///
+/// @tparam T Element type of the input, filter and expected output values; outputs are read back as T.
+/// @tparam B Element type of the bias values.
+/// @param backends Backends used to construct the armnnDelegate::DelegateOptions.
+/// @param inputValues Data passed to armnnDelegate::FillInput (index 0) for both interpreters.
+/// @param expectedOutputValues Reference output; its size is the number of elements compared.
+/// All other parameters are forwarded unchanged to CreateConv3dTfLiteModel.
+template <typename T, typename B = float>
+void Convolution3dTest(tflite::BuiltinOperator convolutionOperatorCode,
+                       tflite::TensorType tensorType,
+                       std::vector<uint32_t> strides,
+                       std::vector<uint32_t> dilation,
+                       tflite::Padding padding,
+                       tflite::ActivationFunctionType fused_activation_function,
+                       std::vector<armnn::BackendId>& backends,
+                       std::vector<int32_t>& inputShape,
+                       std::vector<int32_t>& filterShape,
+                       std::vector<int32_t>& outputShape,
+                       std::vector<T>& inputValues,
+                       std::vector<T>& filterValues,
+                       std::vector<T>& expectedOutputValues,
+                       const std::vector<int32_t>& biasShape = {},
+                       const std::vector<B>& biasValues = {},
+                       const std::vector<float> biasScales = {1.0f},
+                       const std::vector<int64_t> biasOffsets = {0},
+                       const std::vector<float> filterScales = {1.0f},
+                       const std::vector<int64_t> filterOffsets = {0},
+                       float outputQuantScale = 2.0f,
+                       int outputQuantOffset = 0,
+                       float quantScale = 1.0f,
+                       int quantOffset = 0,
+                       int32_t depth_multiplier = 1,
+                       int32_t filterQuantizationDim = 3)
+{
+    using namespace tflite;
+
+    const std::vector<char> modelBuffer =
+            CreateConv3dTfLiteModel(convolutionOperatorCode, tensorType, strides, dilation, padding,
+                                    fused_activation_function, inputShape, filterShape, biasShape, outputShape,
+                                    filterValues, biasValues, biasScales, biasOffsets, filterScales, filterOffsets,
+                                    outputQuantScale, outputQuantOffset, quantScale, quantOffset,
+                                    depth_multiplier, filterQuantizationDim);
+
+    const Model* tfLiteModel = GetModel(modelBuffer.data());
+
+    // Create TfLite Interpreters. Both are built from the same model; only armnnDelegateInterpreter is
+    // handed to the Arm NN delegate below.
+    std::unique_ptr<Interpreter> armnnDelegateInterpreter;
+    CHECK(InterpreterBuilder(tfLiteModel, ::tflite::ops::builtin::BuiltinOpResolver())
+                  (&armnnDelegateInterpreter) == kTfLiteOk);
+    CHECK(armnnDelegateInterpreter != nullptr);
+    CHECK(armnnDelegateInterpreter->AllocateTensors() == kTfLiteOk);
+
+    std::unique_ptr<Interpreter> tfLiteInterpreter;
+    CHECK(InterpreterBuilder(tfLiteModel, ::tflite::ops::builtin::BuiltinOpResolver())
+                  (&tfLiteInterpreter) == kTfLiteOk);
+    CHECK(tfLiteInterpreter != nullptr);
+    CHECK(tfLiteInterpreter->AllocateTensors() == kTfLiteOk);
+
+    // Create the ArmNN Delegate
+    armnnDelegate::DelegateOptions delegateOptions(backends);
+    std::unique_ptr<TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
+            theArmnnDelegate(armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions),
+                             armnnDelegate::TfLiteArmnnDelegateDelete);
+    CHECK(theArmnnDelegate != nullptr);
+
+    // Modify armnnDelegateInterpreter to use armnnDelegate
+    CHECK(armnnDelegateInterpreter->ModifyGraphWithDelegate(theArmnnDelegate.get()) == kTfLiteOk);
+
+    // Set input data
+    armnnDelegate::FillInput<T>(tfLiteInterpreter, 0, inputValues);
+    armnnDelegate::FillInput<T>(armnnDelegateInterpreter, 0, inputValues);
+
+    // Run EnqueueWorkload
+    CHECK(tfLiteInterpreter->Invoke() == kTfLiteOk);
+    CHECK(armnnDelegateInterpreter->Invoke() == kTfLiteOk);
+
+    // Fetch output data. The outputs are read as T rather than a fixed float so that the pointers have
+    // the same element type as expectedOutputValues; Interpreter::typed_tensor<T> yields nullptr when
+    // the tensor's type is not T.
+    auto tfLiteOutputId = tfLiteInterpreter->outputs()[0];
+    auto tfLiteOutputData = tfLiteInterpreter->typed_tensor<T>(tfLiteOutputId);
+    auto armnnDelegateOutputId = armnnDelegateInterpreter->outputs()[0];
+    auto armnnDelegateOutputData = armnnDelegateInterpreter->typed_tensor<T>(armnnDelegateOutputId);
+
+    // Compare each interpreter's output with the reference values first, then with each other, so a
+    // failure shows which side diverged.
+    armnnDelegate::CompareData(expectedOutputValues.data(), armnnDelegateOutputData, expectedOutputValues.size(), 1);
+    armnnDelegate::CompareData(expectedOutputValues.data(), tfLiteOutputData, expectedOutputValues.size(), 1);
+    armnnDelegate::CompareData(tfLiteOutputData, armnnDelegateOutputData, expectedOutputValues.size(), 1);
+}
+#endif
+
 template <typename T>
 std::vector<char> CreateTransposeConvTfLiteModel(tflite::TensorType tensorType,
                                                  uint32_t strideX,
diff --git a/docs/01_03_delegate.dox b/docs/01_03_delegate.dox
index 04f216a87d..2d30e653fa 100644
--- a/docs/01_03_delegate.dox
+++ b/docs/01_03_delegate.dox
@@ -51,6 +51,8 @@ The Arm NN SDK TensorFlow Lite delegate currently supports the following operato
 
 - CONV_2D, Supported Fused Activation: RELU , RELU6 , TANH, NONE
 
+- CONV_3D, Supported Fused Activation: RELU , RELU6 , TANH, NONE
+
 - DEPTH_TO_SPACE
 
 - DEPTHWISE_CONV_2D, Supported Fused Activation: RELU , RELU6 , TANH, NONE
-- 
cgit v1.2.1