aboutsummaryrefslogtreecommitdiff
path: root/src/backends
diff options
context:
space:
mode:
Diffstat (limited to 'src/backends')
-rw-r--r--src/backends/aclCommon/ArmComputeSubgraphUtils.hpp25
-rw-r--r--src/backends/backendsCommon/WorkloadData.cpp16
-rw-r--r--src/backends/backendsCommon/WorkloadFactory.cpp28
-rw-r--r--src/backends/backendsCommon/test/CMakeLists.txt1
-rw-r--r--src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp183
-rw-r--r--src/backends/backendsCommon/test/OptimizationViewsTests.cpp6
-rw-r--r--src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp143
-rw-r--r--src/backends/cl/ClBackend.cpp4
-rw-r--r--src/backends/neon/NeonBackend.cpp4
-rw-r--r--src/backends/neon/NeonTensorHandle.hpp1
-rw-r--r--src/backends/reference/test/RefLayerTests.cpp220
-rw-r--r--src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp47
-rw-r--r--src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp2
13 files changed, 503 insertions, 177 deletions
diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
index 74ab789402..de3a34ee08 100644
--- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
@@ -263,28 +263,13 @@ LayerType* FuseDepthwiseConvolution2dLayer(OptimizationViews& optimizationViews,
ActivationDescriptor& activationDesc,
std::string name)
{
- std::shared_ptr<ConstTensorHandle> weightHandle = baseLayer->m_Weight;
- TensorInfo weightInfo = weightHandle->GetTensorInfo();
-
- std::shared_ptr<ConstTensorHandle> biasHandle = baseLayer->m_Bias;
- ConstTensor biasTensor;
- if (!biasHandle)
- {
- biasTensor = ConstTensor();
- }
- else
- {
- biasTensor = ConstTensor(biasHandle->GetTensorInfo(), biasHandle->Map(true));
- }
-
- IConnectableLayer* replacement =
- optimizationViews.GetINetwork()->
- AddDepthwiseConvolution2dLayer(baseLayer->GetParameters(),
- ConstTensor(weightInfo, weightHandle->Map(true)),
- Optional<ConstTensor>(biasTensor),
- name.c_str());
+ IConnectableLayer* replacement = optimizationViews.GetINetwork()->
+ AddDepthwiseConvolution2dLayer(baseLayer->GetParameters(), name.c_str());
LayerType* replacementLayer = PolymorphicDowncast<LayerType*>(replacement);
+ replacementLayer->m_Weight = std::move(baseLayer->m_Weight);
+ replacementLayer->m_Bias = std::move(baseLayer->m_Bias);
+
FuseLayer(optimizationViews,
baseLayer,
replacementLayer,
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index d89b5899ba..7a46741964 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1382,7 +1382,13 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
{
const std::string descriptorName{"DepthwiseConvolution2dQueueDescriptor"};
- ValidateNumInputs(workloadInfo, descriptorName, 1);
+ uint32_t numInputs = 2;
+ if (m_Parameters.m_BiasEnabled)
+ {
+ numInputs = 3;
+ }
+
+ ValidateNumInputs(workloadInfo, descriptorName, numInputs);
ValidateNumOutputs(workloadInfo, descriptorName, 1);
const TensorInfo& inputTensorInfo = workloadInfo.m_InputTensorInfos[0];
@@ -1391,9 +1397,7 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
ValidateTensorNumDimensions(inputTensorInfo, descriptorName, 4, "input");
ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 4, "output");
- ValidatePointer(m_Weight, descriptorName, "weight");
-
- const TensorInfo& weightTensorInfo = m_Weight->GetTensorInfo();
+ const TensorInfo& weightTensorInfo = workloadInfo.m_InputTensorInfos[1];
ValidateTensorNumDimensions(weightTensorInfo, descriptorName, 4, "weight");
if (m_Parameters.m_DilationX < 1 || m_Parameters.m_DilationY < 1 )
@@ -1447,9 +1451,7 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
Optional<TensorInfo> optionalBiasTensorInfo;
if (m_Parameters.m_BiasEnabled)
{
- ValidatePointer(m_Bias, descriptorName, "bias");
-
- optionalBiasTensorInfo = MakeOptional<TensorInfo>(m_Bias->GetTensorInfo());
+ optionalBiasTensorInfo = MakeOptional<TensorInfo>(workloadInfo.m_InputTensorInfos[2]);
const TensorInfo& biasTensorInfo = optionalBiasTensorInfo.value();
ValidateBiasTensorQuantization(biasTensorInfo, inputTensorInfo, weightTensorInfo, descriptorName);
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 5847e8cc21..f624ee6021 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -324,10 +324,13 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId,
case LayerType::DepthwiseConvolution2d:
{
auto cLayer = PolymorphicDowncast<const DepthwiseConvolution2dLayer*>(&layer);
- const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(),
- dataType);
- const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType);
- ARMNN_ASSERT(cLayer->m_Weight.get() != nullptr);
+ const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(),
+ dataType);
+ const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType);
+ const TensorInfo& weights = OverrideDataType(layer.GetInputSlot(1).GetConnection()->GetTensorInfo(),
+ dataType);
+
+ ARMNN_ASSERT(cLayer->GetInputSlot(1).GetConnection() != nullptr);
const DepthwiseConvolution2dDescriptor& descriptor = cLayer->GetParameters();
@@ -335,17 +338,16 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId,
Optional<TensorInfo> biases;
if (descriptor.m_BiasEnabled)
{
- biases =
- OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType));
+ biases = OverrideDataType(cLayer->GetInputSlot(2).GetConnection()->GetTensorInfo(),
+ GetBiasTypeFromWeightsType(dataType));
}
- result = layerSupportObject.IsDepthwiseConvolutionSupported(
- input,
- output,
- descriptor,
- OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType),
- biases,
- reason);
+ result = layerSupportObject.IsDepthwiseConvolutionSupported(input,
+ output,
+ descriptor,
+ weights,
+ biases,
+ reason);
break;
}
case LayerType::Dequantize:
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 06d230b006..991f37d17e 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -21,6 +21,7 @@ list(APPEND armnnBackendsCommonUnitTests_sources
DataTypeUtils.hpp
DefaultAsyncExecuteTest.cpp
DepthToSpaceEndToEndTestImpl.hpp
+ DepthwiseConvolution2dEndToEndTests.hpp
DequantizeEndToEndTestImpl.hpp
DetectionPostProcessEndToEndTestImpl.hpp
DynamicBackendTests.cpp
diff --git a/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp b/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp
new file mode 100644
index 0000000000..1f9b60a4f2
--- /dev/null
+++ b/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp
@@ -0,0 +1,183 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "EndToEndTestImpl.hpp"
+#include <armnnUtils/QuantizeHelper.hpp>
+
+#include <ResolveType.hpp>
+
+#include <CommonTestUtils.hpp>
+#include <armnnTestUtils/DataLayoutUtils.hpp>
+
+#include <map>
+#include <vector>
+
+namespace
+{
+
+armnn::INetworkPtr CreateDepthwiseConvolution2dNetwork(const armnn::DepthwiseConvolution2dDescriptor& descriptor,
+ const armnn::TensorInfo& inputInfo,
+ const armnn::TensorInfo& weightsInfo,
+ const armnn::TensorInfo& biasInfo,
+ const armnn::TensorInfo& outputInfo,
+ const armnn::ConstTensor& weights,
+ const armnn::ConstTensor& biases)
+{
+ using namespace armnn;
+
+ INetworkPtr network(INetwork::Create());
+ IConnectableLayer* input = network->AddInputLayer(0, "input");
+ armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights, "Weights");
+ armnn::IConnectableLayer* biasLayer = network->AddConstantLayer(biases, "Bias");
+ IConnectableLayer* convolution2d = network->AddDepthwiseConvolution2dLayer(descriptor, "depthwiseConvolution2d");
+ IConnectableLayer* output = network->AddOutputLayer(0, "output");
+
+ Connect(input, convolution2d, inputInfo, 0, 0);
+ Connect(weightsLayer, convolution2d, weightsInfo, 0, 1);
+ Connect(biasLayer, convolution2d, biasInfo, 0, 2);
+ Connect(convolution2d, output, outputInfo, 0, 0);
+
+ return network;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType>
+void DepthwiseConvolution2dEndToEnd(const std::vector<armnn::BackendId>& backends,
+ armnn::DataLayout dataLayout)
+{
+ using namespace armnn;
+ using T = ResolveType<ArmnnType>;
+ using BT = ResolveType<ArmnnBType>;
+
+ const float qScale = IsQuantizedType<T>() ? 0.25f : 1.0f;
+ const int32_t qOffset = IsQuantizedType<T>() ? 50 : 0;
+
+ unsigned int depthMultiplier = 2;
+
+ unsigned int inputHeight = 8;
+ unsigned int inputWidth = 16;
+ unsigned int inputChannels = 2;
+ unsigned int inputBatchSize = 1;
+
+ unsigned int kernelHeight = 5;
+ unsigned int kernelWidth = 3;
+
+ unsigned int outputHeight = inputHeight - kernelHeight + 1 + 2;
+ unsigned int outputWidth = (inputWidth - kernelWidth + 1)/2;
+ unsigned int outputChannels = inputChannels * depthMultiplier;
+ unsigned int outputBatchSize = inputBatchSize;
+
+ TensorInfo inputInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth }, ArmnnType, qScale, qOffset, true);
+ TensorInfo outputInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth }, ArmnnType, qScale, qOffset);
+ TensorInfo weightsInfo({1, kernelHeight, kernelWidth, outputChannels}, ArmnnType, qScale, qOffset, true);
+ TensorInfo biasesInfo({outputChannels}, ArmnnBType, qScale * qScale, 0, true);
+
+ std::vector<float> inputData =
+ {
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+ };
+
+ std::vector<float> weightsData =
+ {
+ 1.0f, 1.0f, 1.0f,
+ 1.0f, -1.0f, 1.0f,
+ 1.0f, 1.0f, 1.0f,
+ 1.0f, 1.0f, 1.0f,
+ 1.0f, 1.0f, 1.0f,
+
+ 2.0f, 2.0f, 2.0f,
+ 2.0f, 2.0f, 2.0f,
+ 2.0f, 2.0f, 2.0f,
+ 2.0f, 2.0f, 2.0f,
+ 2.0f, 2.0f, 2.0f,
+
+ 0.0f, 0.0f, 0.0f,
+ 0.0f, -1.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f,
+
+ 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f,
+ 0.0f, 1.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f
+ };
+
+ std::vector<float> biasesData = { 0.0f, 2.0f, 1.0f, -1.0f };
+
+ std::vector<float> expectedOutputData =
+ {
+ 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f,
+ 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f,
+ 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f,
+ 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f,
+ 4.5f, 4.5f, 4.5f, 4.5f, 4.5f, 4.5f, 4.5f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f,
+ 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f,
+ 1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+ };
+
+ DepthwiseConvolution2dDescriptor descriptor;
+ descriptor.m_PadLeft = 0;
+ descriptor.m_PadRight = 0;
+ descriptor.m_PadTop = 1;
+ descriptor.m_PadBottom = 0;
+ descriptor.m_StrideX = 2;
+ descriptor.m_StrideY = 1;
+ descriptor.m_BiasEnabled = true;
+ descriptor.m_DataLayout = dataLayout;
+
+ // Permute input and output if NCHW.
+ if (dataLayout == DataLayout::NCHW)
+ {
+ PermuteTensorNhwcToNchw(inputInfo, inputData);
+ PermuteTensorNhwcToNchw(outputInfo, expectedOutputData);
+ }
+
+ // Quantize data
+ std::vector<T> qInputData = armnnUtils::QuantizedVector<T>(inputData, qScale, qOffset);
+ std::vector<T> qWeightsData = armnnUtils::QuantizedVector<T>(weightsData, qScale, qOffset);
+ std::vector<T> qExpectedOutputData = armnnUtils::QuantizedVector<T>(expectedOutputData, qScale, qOffset);
+
+ std::vector<BT> qBiasesData = armnnUtils::QuantizedVector<BT>(biasesData, qScale * qScale, 0);
+
+ ConstTensor weights(weightsInfo, qWeightsData);
+ ConstTensor biases(biasesInfo, qBiasesData);
+
+ INetworkPtr network = CreateDepthwiseConvolution2dNetwork(descriptor,
+ inputInfo,
+ weightsInfo,
+ biasesInfo,
+ outputInfo,
+ weights,
+ biases);
+
+ EndToEndLayerTestImpl<ArmnnType, ArmnnType>(std::move(network),
+ { { 0, qInputData } },
+ { { 0, qExpectedOutputData } },
+ backends);
+}
diff --git a/src/backends/backendsCommon/test/OptimizationViewsTests.cpp b/src/backends/backendsCommon/test/OptimizationViewsTests.cpp
index f0f5b632de..1219ac5a33 100644
--- a/src/backends/backendsCommon/test/OptimizationViewsTests.cpp
+++ b/src/backends/backendsCommon/test/OptimizationViewsTests.cpp
@@ -128,20 +128,16 @@ TEST_CASE("OptimizedViewsSubgraphLayerCountUsingGetINetwork")
IConnectableLayer* const inputLayer = view.GetINetwork()->AddInputLayer(0, "input");
DepthwiseConvolution2dDescriptor convDescriptor;
- PreCompiledDescriptor substitutionLayerDescriptor(1, 1);
+ PreCompiledDescriptor substitutionLayerDescriptor(2, 1);
CompiledBlobPtr blobPtr;
BackendId backend = Compute::CpuRef;
Layer* convLayer1 = PolymorphicDowncast<Layer*>(
view.GetINetwork()->AddDepthwiseConvolution2dLayer(convDescriptor,
- ConstTensor(),
- Optional<ConstTensor>(),
"conv1"));
Layer* convLayer2 = PolymorphicDowncast<Layer*>(
view.GetINetwork()->AddDepthwiseConvolution2dLayer(convDescriptor,
- ConstTensor(),
- Optional<ConstTensor>(),
"conv2"));
IConnectableLayer* const outputLayer = view.GetINetwork()->AddOutputLayer(0, "output");
diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
index bd7cc40f27..4203fed23a 100644
--- a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
@@ -1736,19 +1736,38 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
}
std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelDesc);
+ std::unique_ptr<armnn::ITensorHandle> biasHandle = nullptr;
std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+ armnn::DepthwiseConvolution2dQueueDescriptor data;
+ armnn::WorkloadInfo info;
+
armnn::ScopedTensorHandle weightsTensor(kernelDesc);
+ // AllocateAndCopyDataToITensorHandle() is required twice for the weights AND biases:
+ // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight).
+ // Needed in Neon and Cl Workload when permuting. Backend TensorHandle in (2) below will not work.
+ // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method.
+ // Cannot PolymorphicDowncast from ScopedTensorHandle->RefTensorHandle.
+ // Need to PolymorphicDowncast from ITensorHandle->RefTensorHandle.
AllocateAndCopyDataToITensorHandle(&weightsTensor, kernel.data());
+ AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernel.data()); // required for ConstantTensor
+
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, kernelDesc, weightsHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
armnn::ScopedTensorHandle biasTensor(biasDesc);
if (biasEnabled)
{
AllocateAndCopyDataToITensorHandle(&biasTensor, bias.data());
+
+ biasHandle = tensorHandleFactory.CreateTensorHandle(biasDesc);
+ AllocateAndCopyDataToITensorHandle(biasHandle.get(), bias.data());
+ AddInputToWorkload(data, info, biasDesc, biasHandle.get());
}
- armnn::DepthwiseConvolution2dQueueDescriptor data;
data.m_Weight = &weightsTensor;
data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - it can be a source of bugs.
data.m_Parameters.m_StrideX = strideX;
@@ -1760,12 +1779,9 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
data.m_Parameters.m_BiasEnabled = biasEnabled;
data.m_Parameters.m_DataLayout = layout;
- armnn::WorkloadInfo info;
- AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
- AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
-
std::unique_ptr<armnn::IWorkload> workload
= workloadFactory.CreateWorkload(armnn::LayerType::DepthwiseConvolution2d, data, info);
+
inputHandle->Allocate();
outputHandle->Allocate();
@@ -1890,19 +1906,35 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
std::vector<T> actualOutput(outputTensorInfo.GetNumElements());
std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelDesc);
+ std::unique_ptr<armnn::ITensorHandle> biasHandle = nullptr;
std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
armnn::DepthwiseConvolution2dQueueDescriptor data;
armnn::WorkloadInfo info;
- armnn::ScopedTensorHandle weightsTensor(kernelDesc);
- armnn::ScopedTensorHandle biasTensor(biasDesc);
- AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data());
- AllocateAndCopyDataToITensorHandle(&biasTensor, biasV.data());
+ armnn::ScopedTensorHandle weightsTensor(kernelDesc);
+ // AllocateAndCopyDataToITensorHandle() is required twice for the weights AND biases:
+ // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons.
+ // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight).
+ // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method.
+ AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data()); // required for QueueDescriptor
+ AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernelData.data()); // required for ConstantTensor
AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, kernelDesc, weightsHandle.get());
AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+ armnn::ScopedTensorHandle biasTensor(biasDesc);
+ if (biasEnabled)
+ {
+ AllocateAndCopyDataToITensorHandle(&biasTensor, biasV.data());
+
+ biasHandle = tensorHandleFactory.CreateTensorHandle(biasDesc);
+ AllocateAndCopyDataToITensorHandle(biasHandle.get(), biasV.data());
+ AddInputToWorkload(data, info, biasDesc, biasHandle.get());
+ }
+
data.m_Weight = &weightsTensor;
data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled.
data.m_Parameters.m_StrideX = 1;
@@ -1916,6 +1948,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
std::unique_ptr<armnn::IWorkload> workload
= workloadFactory.CreateWorkload(armnn::LayerType::DepthwiseConvolution2d, data, info);
+
inputHandle->Allocate();
outputHandle->Allocate();
@@ -2086,19 +2119,35 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
std::vector<T> actualOutput(outputTensorInfo.GetNumElements());
std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelDesc);
+ std::unique_ptr<armnn::ITensorHandle> biasHandle = nullptr;
std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
armnn::DepthwiseConvolution2dQueueDescriptor data;
armnn::WorkloadInfo info;
- armnn::ScopedTensorHandle weightsTensor(kernelDesc);
- armnn::ScopedTensorHandle biasTensor(biasDesc);
- AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data());
- AllocateAndCopyDataToITensorHandle(&biasTensor, biasV.data());
+ armnn::ScopedTensorHandle weightsTensor(kernelDesc);
+ // AllocateAndCopyDataToITensorHandle() is required twice for the weights AND biases:
+ // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons.
+ // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight).
+ // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method.
+ AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data()); // required for QueueDescriptor
+ AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernelData.data()); // required for ConstantTensor
AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, kernelDesc, weightsHandle.get());
AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+ armnn::ScopedTensorHandle biasTensor(biasDesc);
+ if (biasEnabled)
+ {
+ AllocateAndCopyDataToITensorHandle(&biasTensor, biasV.data());
+
+ biasHandle = tensorHandleFactory.CreateTensorHandle(biasDesc);
+ AllocateAndCopyDataToITensorHandle(biasHandle.get(), biasV.data());
+ AddInputToWorkload(data, info, biasDesc, biasHandle.get());
+ }
+
data.m_Weight = &weightsTensor;
data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled.
data.m_Parameters.m_StrideX = 2;
@@ -2112,6 +2161,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
std::unique_ptr<armnn::IWorkload> workload
= workloadFactory.CreateWorkload(armnn::LayerType::DepthwiseConvolution2d, data, info);
+
inputHandle->Allocate();
outputHandle->Allocate();
@@ -2247,22 +2297,34 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
}
std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelDesc);
+ std::unique_ptr<armnn::ITensorHandle> biasHandle = nullptr;
std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
armnn::DepthwiseConvolution2dQueueDescriptor data;
armnn::WorkloadInfo info;
+
armnn::ScopedTensorHandle weightsTensor(kernelDesc);
- armnn::ScopedTensorHandle biasTensor(biasDesc);
+ // AllocateAndCopyDataToITensorHandle() is required twice for the weights AND biases:
+ // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons.
+ // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight).
+ // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method.
+ AllocateAndCopyDataToITensorHandle(&weightsTensor, originalKernel.data()); // required for QueueDescriptor
+ AllocateAndCopyDataToITensorHandle(weightsHandle.get(), originalKernel.data()); // required for ConstantTensor
- AllocateAndCopyDataToITensorHandle(&weightsTensor, originalKernel.data());
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, kernelDesc, weightsHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
- if(biasEnabled)
+ armnn::ScopedTensorHandle biasTensor(biasDesc);
+ if (biasEnabled)
{
AllocateAndCopyDataToITensorHandle(&biasTensor, bias.data());
- }
- AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
- AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+ biasHandle = tensorHandleFactory.CreateTensorHandle(biasDesc);
+ AllocateAndCopyDataToITensorHandle(biasHandle.get(), bias.data());
+ AddInputToWorkload(data, info, biasDesc, biasHandle.get());
+ }
data.m_Weight = &weightsTensor;
data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - can be a source of bugs.
@@ -2279,6 +2341,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
std::unique_ptr<armnn::IWorkload> workload
= workloadFactory.CreateWorkload(armnn::LayerType::DepthwiseConvolution2d, data, info);
+
inputHandle->Allocate();
outputHandle->Allocate();
@@ -2970,18 +3033,30 @@ LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
std::vector<T> expectedOutput(outputTensorInfo.GetNumElements());
std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelDesc);
+ std::unique_ptr<armnn::ITensorHandle> biasHandle = tensorHandleFactory.CreateTensorHandle(biasDesc);
std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
armnn::DepthwiseConvolution2dQueueDescriptor data;
armnn::WorkloadInfo info;
+
armnn::ScopedTensorHandle weightsTensor(kernelDesc);
armnn::ScopedTensorHandle biasTensor(biasDesc);
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, kernelDesc, weightsHandle.get());
+ AddInputToWorkload(data, info, biasDesc, biasHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+ // AllocateAndCopyDataToITensorHandle() is required twice for the weights AND biases:
+ // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons.
+ // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight).
+ // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method.
+ AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernel.data());
AllocateAndCopyDataToITensorHandle(&weightsTensor, kernel.data());
+ AllocateAndCopyDataToITensorHandle(biasHandle.get(), bias.data());
AllocateAndCopyDataToITensorHandle(&biasTensor, bias.data());
- AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
- AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
data.m_Weight = &weightsTensor;
data.m_Bias = &biasTensor;
data.m_Parameters.m_StrideX = strideX;
@@ -2994,11 +3069,15 @@ LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
data.m_Parameters.m_DataLayout = layout.GetDataLayout();
std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refTensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> weightsHandleRef = refTensorHandleFactory.CreateTensorHandle(kernelDesc);
+ std::unique_ptr<armnn::ITensorHandle> biasHandleRef = refTensorHandleFactory.CreateTensorHandle(biasDesc);
std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refTensorHandleFactory.CreateTensorHandle(inputTensorInfo);
armnn::DepthwiseConvolution2dQueueDescriptor refData = data;
armnn::WorkloadInfo refInfo = info;
SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+ SetWorkloadInput(refData, refInfo, 1, kernelDesc, weightsHandleRef.get());
+ SetWorkloadInput(refData, refInfo, 2, biasDesc, biasHandleRef.get());
SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
std::unique_ptr<armnn::IWorkload> workload
@@ -3007,6 +3086,8 @@ LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
= refWorkloadFactory.CreateWorkload(armnn::LayerType::DepthwiseConvolution2d, refData, refInfo);
outputHandleRef->Allocate();
+ weightsHandleRef->Allocate();
+ biasHandleRef->Allocate();
inputHandleRef->Allocate();
inputHandle->Allocate();
@@ -3014,6 +3095,8 @@ LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
CopyDataToITensorHandle(inputHandle.get(), input.data());
CopyDataToITensorHandle(inputHandleRef.get(), input.data());
+ CopyDataToITensorHandle(weightsHandleRef.get(), kernel.data());
+ CopyDataToITensorHandle(biasHandleRef.get(), bias.data());
ExecuteWorkload(*workload, memoryManager);
@@ -3739,23 +3822,33 @@ LayerTestResult<uint8_t, 4> DepthwiseConvolution2dPerAxisQuantTest(
descriptor.m_DataLayout = layout;
std::unique_ptr<ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputInfo);
+ std::unique_ptr<ITensorHandle> weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelInfo);
+ std::unique_ptr<ITensorHandle> biasHandle = tensorHandleFactory.CreateTensorHandle(biasInfo);
std::unique_ptr<ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputInfo);
+ DepthwiseConvolution2dQueueDescriptor queueDescriptor;
WorkloadInfo workloadInfo;
ScopedTensorHandle weightTensor(kernelInfo);
ScopedTensorHandle biasTensor(biasInfo);
+ AddInputToWorkload(queueDescriptor, workloadInfo, inputInfo, inputHandle.get());
+ AddInputToWorkload(queueDescriptor, workloadInfo, kernelInfo, weightsHandle.get());
+ AddOutputToWorkload(queueDescriptor, workloadInfo, outputInfo, outputHandle.get());
+ AddInputToWorkload(queueDescriptor, workloadInfo, biasInfo, biasHandle.get());
+
+ // AllocateAndCopyDataToITensorHandle() is required twice for the weights AND biases:
+ // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons.
+ // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight).
+ // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method.
+ AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernelData.data());
AllocateAndCopyDataToITensorHandle(&weightTensor, kernelData.data());
+ AllocateAndCopyDataToITensorHandle(biasHandle.get(), biasData.data());
AllocateAndCopyDataToITensorHandle(&biasTensor, biasData.data());
- DepthwiseConvolution2dQueueDescriptor queueDescriptor;
queueDescriptor.m_Parameters = descriptor;
queueDescriptor.m_Weight = &weightTensor;
queueDescriptor.m_Bias = &biasTensor;
- AddInputToWorkload(queueDescriptor, workloadInfo, inputInfo, inputHandle.get());
- AddOutputToWorkload(queueDescriptor, workloadInfo, outputInfo, outputHandle.get());
-
std::unique_ptr<IWorkload> workload = workloadFactory.CreateWorkload(armnn::LayerType::DepthwiseConvolution2d,
queueDescriptor,
workloadInfo);
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index 018adec781..ed6f221511 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -373,14 +373,14 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
if (baseLayer->GetParameters().m_BiasEnabled)
{
- biases = baseLayer->m_Bias->GetTensorInfo();
+ biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
}
arm_compute::Status status = ClDepthwiseConvolutionWorkloadValidate(
baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
baseLayer->GetParameters(),
- baseLayer->m_Weight->GetTensorInfo(),
+ baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
biases,
&activationDesc);
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index 7089f23efa..7a258c38e0 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -225,14 +225,14 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
if (baseLayer->GetParameters().m_BiasEnabled)
{
- biases = baseLayer->m_Bias->GetTensorInfo();
+ biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
}
arm_compute::Status status = NeonDepthwiseConvolutionWorkloadValidate(
baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
baseLayer->GetParameters(),
- baseLayer->m_Weight->GetTensorInfo(),
+ baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
biases,
&activationDesc);
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
index 9445cb1c75..fcae77cdaa 100644
--- a/src/backends/neon/NeonTensorHandle.hpp
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -241,6 +241,7 @@ private:
break;
case arm_compute::DataType::QSYMM8:
case arm_compute::DataType::QASYMM8_SIGNED:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
this->GetTensor());
break;
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 496b11db91..29df20f834 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -329,9 +329,9 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dUint8, DepthwiseConvolution2
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2d, DepthwiseConvolution2dTest, false, DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dUint8,
- DepthwiseConvolution2dUint8Test,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2dUint8Test,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dQSymm16, DepthwiseConvolution2dInt16Test, true, DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dNhwc, DepthwiseConvolution2dTest, true, DataLayout::NHWC)
@@ -339,148 +339,176 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dUint8Nhwc, DepthwiseConvolut
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dNhwc, DepthwiseConvolution2dTest, false, DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dUint8Nhwc,
- DepthwiseConvolution2dUint8Test,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2dUint8Test,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthNhwc, DepthwiseConvolution2dDepthNhwcTest, false)
ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleDepthwiseConvolution2d3x3Dilation3x3Nhwc,
- SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTest)
+ SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTest)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3Nhwc,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3BFloat16,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3NhwcBFloat16,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3Int8,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3NhwcInt8,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3Uint8,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3NhwcUint8,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3Int16,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d3x3Dilation3x3NhwcInt16,
- DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3Nhwc,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::Float32, DataType::Float32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3BFloat16,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3NhwcBFloat16,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::BFloat16, DataType::BFloat16>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3Int8,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3NhwcInt8,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmS8, DataType::Signed32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3Uint8,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3NhwcUint8,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QAsymmU8, DataType::Signed32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3Int16,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
- false,
- DataLayout::NCHW)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d2x3x3Dilation3x3NhwcInt16,
- DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
- false,
- DataLayout::NHWC)
+ DepthwiseConvolution2d2x3x3Dilation3x3Test<DataType::QSymmS16, DataType::Signed32>,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dMult4,
- DepthwiseConvolution2dMult4Test<armnn::DataType::Float32, armnn::DataType::Float32>,
- false,
- armnn::DataLayout::NCHW)
+ DepthwiseConvolution2dMult4Test<armnn::DataType::Float32, armnn::DataType::Float32>,
+ false,
+ armnn::DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dMult2,
- DepthwiseConvolution2dMult2Test<armnn::DataType::Float32, armnn::DataType::Float32>,
- false,
- armnn::DataLayout::NCHW)
+ DepthwiseConvolution2dMult2Test<armnn::DataType::Float32, armnn::DataType::Float32>,
+ false,
+ armnn::DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dMult4BFloat16,
- DepthwiseConvolution2dMult4Test<armnn::DataType::BFloat16, armnn::DataType::BFloat16>,
- false,
- armnn::DataLayout::NCHW)
+ DepthwiseConvolution2dMult4Test<armnn::DataType::BFloat16, armnn::DataType::BFloat16>,
+ false,
+ armnn::DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dMult2BFloat16,
- DepthwiseConvolution2dMult2Test<armnn::DataType::BFloat16, armnn::DataType::BFloat16>,
- false,
- armnn::DataLayout::NCHW)
+ DepthwiseConvolution2dMult2Test<armnn::DataType::BFloat16, armnn::DataType::BFloat16>,
+ false,
+ armnn::DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul1,
- DepthwiseConvolution2dDepthMul1Test, true, DataLayout::NCHW)
+ DepthwiseConvolution2dDepthMul1Test,
+ true,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul1Uint8,
- DepthwiseConvolution2dDepthMul1Uint8Test, true, DataLayout::NCHW)
+ DepthwiseConvolution2dDepthMul1Uint8Test,
+ true,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul1Int16,
- DepthwiseConvolution2dDepthMul1Int16Test, true, DataLayout::NCHW)
+ DepthwiseConvolution2dDepthMul1Int16Test,
+ true,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dDepthMul1,
- DepthwiseConvolution2dDepthMul1Test, false, DataLayout::NCHW)
+ DepthwiseConvolution2dDepthMul1Test,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dDepthMul1Uint8,
- DepthwiseConvolution2dDepthMul1Uint8Test, false, DataLayout::NCHW)
+ DepthwiseConvolution2dDepthMul1Uint8Test,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul1Nhwc,
- DepthwiseConvolution2dDepthMul1Test, true, DataLayout::NHWC)
+ DepthwiseConvolution2dDepthMul1Test,
+ true,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul1Uint8Nhwc,
- DepthwiseConvolution2dDepthMul1Uint8Test, true, DataLayout::NHWC)
+ DepthwiseConvolution2dDepthMul1Uint8Test,
+ true,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dDepthMul1Nhwc,
- DepthwiseConvolution2dDepthMul1Test, false, DataLayout::NHWC)
+ DepthwiseConvolution2dDepthMul1Test,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dDepthMul1Uint8Nhwc,
- DepthwiseConvolution2dDepthMul1Uint8Test, false, DataLayout::NHWC)
+ DepthwiseConvolution2dDepthMul1Uint8Test,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dAsymmetric,
- DepthwiseConvolution2dAsymmetricTest, true, DataLayout::NCHW)
+ DepthwiseConvolution2dAsymmetricTest,
+ true,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dAsymmetric,
- DepthwiseConvolution2dAsymmetricTest, false, DataLayout::NCHW)
+ DepthwiseConvolution2dAsymmetricTest,
+ false,
+ DataLayout::NCHW)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dAsymmetricNhwc,
- DepthwiseConvolution2dAsymmetricTest, true, DataLayout::NHWC)
+ DepthwiseConvolution2dAsymmetricTest,
+ true,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dAsymmetricNhwc,
- DepthwiseConvolution2dAsymmetricTest, false, DataLayout::NHWC)
+ DepthwiseConvolution2dAsymmetricTest,
+ false,
+ DataLayout::NHWC)
ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul64, DepthwiseConvolution2dDepthMul64Test);
-ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dPerAxisQuantTestNchw, DepthwiseConvolution2dPerAxisQuantTest,
- DataLayout::NCHW);
-ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dPerAxisQuantTestNhwc, DepthwiseConvolution2dPerAxisQuantTest,
- DataLayout::NHWC);
+ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dPerAxisQuantTestNchw,
+ DepthwiseConvolution2dPerAxisQuantTest,
+ DataLayout::NCHW);
+ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dPerAxisQuantTestNhwc,
+ DepthwiseConvolution2dPerAxisQuantTest,
+ DataLayout::NHWC);
// [ Pooling 2D
//MaxPooling
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
index ad5edde7e6..c1c3916292 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
@@ -19,16 +19,41 @@ RefDepthwiseConvolution2dWorkload::RefDepthwiseConvolution2dWorkload(
const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
: RefBaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
{
- m_Weight = std::make_unique<ScopedTensorHandle>(*(descriptor.m_Weight));
- const TensorInfo& rFilterInfo = m_Weight->GetTensorInfo();
- m_FilterShape = rFilterInfo.GetShape();
- m_FilterDecoder = MakeDecoder<float>(rFilterInfo, m_Weight->Map(true));
+ WorkloadInfo detailsInfo;
+ detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
+ detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos;
+ detailsInfo.m_WeightsTensorInfo = armnn::Optional<armnn::TensorInfo>(info.m_InputTensorInfos[1]);
if (descriptor.m_Parameters.m_BiasEnabled)
{
- m_Bias = std::make_unique<ScopedTensorHandle>(*(descriptor.m_Bias));
- const TensorInfo& biasInfo = m_Bias->GetTensorInfo();
- m_BiasDecoder = MakeDecoder<float>(biasInfo, m_Bias->Map(true));
+ detailsInfo.m_BiasTensorInfo = armnn::Optional<armnn::TensorInfo>(info.m_InputTensorInfos[2]);
+ }
+
+ // Report Profiling Details
+ ARMNN_REPORT_PROFILING_WORKLOAD_DESC("RefDepthwiseConvolution2dWorkload_Construct",
+ descriptor.m_Parameters,
+ detailsInfo,
+ this->GetGuid());
+}
+
+void RefDepthwiseConvolution2dWorkload::PostAllocationConfigure()
+{
+ PostAllocationConfigure(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefDepthwiseConvolution2dWorkload::PostAllocationConfigure(std::vector<ITensorHandle*> inputs,
+ std::vector<ITensorHandle*> outputs)
+{
+ IgnoreUnused(outputs);
+
+ const TensorInfo& rFilterInfo = GetTensorInfo(inputs[1]);
+ m_FilterShape = rFilterInfo.GetShape();
+ m_FilterDecoder = MakeDecoder<float>(rFilterInfo);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ const TensorInfo& biasInfo = GetTensorInfo(inputs[2]);
+ m_BiasDecoder = MakeDecoder<float>(biasInfo);
}
}
@@ -39,6 +64,8 @@ void RefDepthwiseConvolution2dWorkload::Execute() const
void RefDepthwiseConvolution2dWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
{
+ PostAllocationConfigure(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+
Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
}
@@ -54,6 +81,12 @@ void RefDepthwiseConvolution2dWorkload::Execute(std::vector<ITensorHandle*> inpu
const TensorShape& inputShape = GetTensorInfo(inputs[0]).GetShape();
const TensorShape& outputShape = GetTensorInfo(outputs[0]).GetShape();
+ m_FilterDecoder->Reset(inputs[1]->Map());
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasDecoder->Reset(inputs[2]->Map());
+ }
+
Convolve(inputShape, *inputDecoder, outputShape, *OutputEncoder,
m_FilterShape, *m_FilterDecoder, m_Data.m_Parameters.m_BiasEnabled, m_BiasDecoder.get(),
m_Data.m_Parameters.m_DataLayout, m_Data.m_Parameters.m_PadTop, m_Data.m_Parameters.m_PadLeft,
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
index 5d4b483fa7..1c7de29b37 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
@@ -17,11 +17,13 @@ public:
explicit RefDepthwiseConvolution2dWorkload(const DepthwiseConvolution2dQueueDescriptor &descriptor,
const WorkloadInfo &info);
+ void PostAllocationConfigure() override;
void Execute() const override;
void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
private:
+ void PostAllocationConfigure(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs);
void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
std::unique_ptr <ScopedTensorHandle> m_Weight;