4 files changed, 203 insertions, 0 deletions
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 549222bd7a..d2ebd4cde6 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1167,6 +1167,11 @@ OptimizationResult ApplyBackendOptimizations(OptimizedNetworkImpl* optNetObjPtr,
         auto backendObjPtr = backends.find(selectedBackend)->second.get();
         ARMNN_ASSERT(backendObjPtr);
 
+        if(selectedBackend == armnn::Compute::GpuAcc || selectedBackend == armnn::Compute::CpuAcc)
+        {
+            Optimizer::Pass(optGraph, MakeOptimizations(optimizations::PermuteDepthwiseConv2dWeights()));
+        }
+
         // Select sub-graphs based on backend
         SubgraphViewSelector::Subgraphs subgraphs =
                 SubgraphViewSelector::SelectSubgraphs(optGraph,
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index 2bc54d993d..38c4ac9462 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -18,6 +18,7 @@
 #include "OptimizeInversePermutes.hpp"
 #include "PermuteAsReshape.hpp"
 #include "PermuteAndBatchToSpaceAsDepthToSpace.hpp"
+#include "PermuteDepthwiseConv2dWeights.hpp"
 #include "RedirectMembersToConstantInputs.hpp"
 #include "SquashEqualSiblings.hpp"
 #include "TransposeAsReshape.hpp"
 \ No newline at end of file
diff --git a/src/armnn/optimizations/PermuteDepthwiseConv2dWeights.hpp b/src/armnn/optimizations/PermuteDepthwiseConv2dWeights.hpp
new file mode 100644
index 0000000000..d49ddb9f68
--- /dev/null
+++ b/src/armnn/optimizations/PermuteDepthwiseConv2dWeights.hpp
@@ -0,0 +1,81 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "Optimization.hpp"
+#include "NetworkUtils.hpp"
+
+#include <armnnUtils/Permute.hpp>
+
+#include <fmt/format.h>
+
+namespace armnn
+{
+namespace optimizations
+{
+
+/// Inserts a Permute layer in front of the weights input (slot 1) of NCHW DepthwiseConvolution2d layers.
+class PermuteDepthwiseConv2dWeightsImpl
+{
+public:
+    /// Handles DepthwiseConvolution2d layers only; every other layer type is left untouched.
+    /// NOTE(review): the layer's assigned backend is not checked here - confirm the caller restricts the pass.
+    void Run(Graph& graph, Layer& layer) const
+    {
+        if (layer.GetType() == LayerType::DepthwiseConvolution2d)
+        {
+            AddPermuteLayer(graph, PolymorphicDowncast<DepthwiseConvolution2dLayer*>(&layer));
+        }
+    }
+
+protected:
+    PermuteDepthwiseConv2dWeightsImpl() = default;
+    ~PermuteDepthwiseConv2dWeightsImpl() = default;
+
+private:
+    /// ArmNN format for weights for depthwise is [1, H, W, C] independently of the input/output layout
+    ///
+    /// ACL format for weights for depthwise is:
+    /// - [1, H, W, C] for [N, H, W, C] input/output layout (matches with ArmNN)
+    /// - [1, C, H, W] for [N, C, H, W] input/output layout
+    ///
+    /// Therefore ArmNN weights have to be permuted when input/output layout is [N, C, H, W] to pass them to ACL.
+    /// Throws InvalidArgumentException for an unknown data layout or an unconnected weights input (slot 1).
+    static void AddPermuteLayer(Graph& graph, DepthwiseConvolution2dLayer* layer)
+    {
+        const DataLayout dataLayout = layer->GetParameters().m_DataLayout;
+        if (dataLayout == armnn::DataLayout::NHWC)
+        {
+            return; // No permutation required. Input and weights data layouts are the same.
+        }
+        if (dataLayout != armnn::DataLayout::NCHW)
+        {
+            throw InvalidArgumentException(fmt::format("Unknown data layout for tensor info conversion: {}",
+                                                       GetDataLayoutName(dataLayout)));
+        }
+        const OutputSlot* weightsSource = layer->GetInputSlot(1).GetConnectedOutputSlot();
+        if (weightsSource == nullptr)
+        {
+            throw InvalidArgumentException("DepthwiseConvolution2d weights input (slot 1) is not connected");
+        }
+        // Weights permutation required. Weights [1,H,W,C] and input [N,C,H,W] data layouts are different.
+        // [ 1, H, W, I*M] --> [ 1, I * M, H, W ]
+        const PermutationVector permutationVector = { 0, 2, 3, 1 };
+        const TensorInfo weightsPermuted = armnnUtils::Permuted(weightsSource->GetTensorInfo(), permutationVector);
+
+        // Inserts NewLayer so layers don't need to be re-sorted.
+        PermuteLayer* permuteLayer = graph.InsertNewLayer<PermuteLayer>(layer->GetInputSlot(1),
+                                                                        PermuteDescriptor(permutationVector),
+                                                                        "permute_layer");
+        permuteLayer->GetOutputSlot().SetTensorInfo(weightsPermuted);
+        // Backends have already been assigned at this stage, so reuse the depthwise layer's backend.
+        permuteLayer->SetBackendId(layer->GetBackendId());
+    }
+};
+/// Optimization built from PermuteDepthwiseConv2dWeightsImpl through OptimizeForType<Layer, ...>.
+using PermuteDepthwiseConv2dWeights = OptimizeForType<Layer, PermuteDepthwiseConv2dWeightsImpl>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/test/optimizations/PermuteDepthwiseConv2dWeightsTests.cpp b/src/armnn/test/optimizations/PermuteDepthwiseConv2dWeightsTests.cpp
new file mode 100644
index 0000000000..24dab7f779
--- /dev/null
+++ b/src/armnn/test/optimizations/PermuteDepthwiseConv2dWeightsTests.cpp
@@ -0,0 +1,116 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../armnnTestUtils/GraphUtils.hpp"
+#include "../armnnTestUtils/TestUtils.hpp"
+
+#include <armnn/INetwork.hpp>
+
+#include <doctest/doctest.h>
+
+using namespace armnn;
+
+namespace
+{
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+/// Builds input + weights_input -> depthwise_conv2d -> output; weights arrive through a second InputLayer.
+armnn::INetworkPtr CreateSimpleDepthwiseConv2dNetwork(const armnn::TensorInfo& inputTensorInfo,
+                                                      const armnn::TensorInfo& outputTensorInfo,
+                                                      const armnn::TensorInfo& weightsTensorInfo,
+                                                      armnn::DepthwiseConvolution2dDescriptor descriptor)
+{
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* dataIn    = net->AddInputLayer(0, "input");
+    IConnectableLayer* weightsIn = net->AddInputLayer(1, "weights_input");
+    IConnectableLayer* depthwise = net->AddDepthwiseConvolution2dLayer(descriptor, "depthwise_conv2d");
+    IConnectableLayer* dataOut   = net->AddOutputLayer(0, "output");
+
+    // Wire slot 0 (data) and slot 1 (weights) of the depthwise layer, then feed the output layer.
+    dataIn->GetOutputSlot(0).Connect(depthwise->GetInputSlot(0));
+    dataIn->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
+    weightsIn->GetOutputSlot(0).Connect(depthwise->GetInputSlot(1));
+    weightsIn->GetOutputSlot(0).SetTensorInfo(weightsTensorInfo);
+    depthwise->GetOutputSlot(0).Connect(dataOut->GetInputSlot(0));
+    depthwise->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+    return net;
+}
+
+/// Optimizes for backendId and checks the permuted weights shape; outputShape is the un-permuted weights shape.
+void PermuteDepthwiseConv2dWeightsTestRunner(INetworkPtr& network,
+                                             const TensorShape& outputShape,
+                                             Compute backendId)
+{
+    // Create ArmNN runtime
+    IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions());
+
+    // Optimise ArmNN network
+    IOptimizedNetworkPtr optNet = Optimize(*network, {backendId}, run->GetDeviceSpec());
+
+    Graph& graph = GetGraphForTesting(optNet.get());
+
+    CHECK(graph.GetNumLayers() == 5);
+    CHECK(CheckSequence(graph.cbegin(),
+                        graph.cend(),
+                        &IsLayerOfType<InputLayer>,
+                        &IsLayerOfType<InputLayer>,
+                        &IsLayerOfType<PermuteLayer>,
+                        &IsLayerOfType<DepthwiseConvolution2dLayer>,
+                        &IsLayerOfType<OutputLayer>));
+
+    // REQUIRE (not CHECK): the statements below dereference the layer and index four dimensions.
+    armnn::Layer* const permuteLayer = GetFirstLayerWithName(graph, "permute_layer");
+    REQUIRE(permuteLayer != nullptr);
+    REQUIRE(outputShape.GetNumDimensions() == 4);
+
+    // [ 1, H, W, I*M] --> [ 1, I * M, H, W ]
+    TensorShape newShape = permuteLayer->GetOutputSlot().GetTensorInfo().GetShape();
+    REQUIRE(newShape.GetNumDimensions() == 4);
+    CHECK((newShape[0] == outputShape[0]));
+    CHECK((newShape[1] == outputShape[3]));
+    CHECK((newShape[2] == outputShape[1]));
+    CHECK((newShape[3] == outputShape[2]));
+}
+
+/// Builds an NCHW depthwise network with rank-4 weights and checks that the optimizer permutes them.
+void PermuteDepthwiseConv2dWeightsTest(Compute backendId)
+{
+    // Input is [N, C, H, W]; weights use the ArmNN depthwise layout [1, H, W, I*M] (2x3 kernel, I*M = 1).
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 2, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 1, 1 }, armnn::DataType::Float32);
+    armnn::TensorInfo weightsTensorInfo({ 1, 2, 3, 1 }, armnn::DataType::Float32);
+
+    DepthwiseConvolution2dDescriptor descriptor;
+    descriptor.m_BiasEnabled = false;
+    descriptor.m_DataLayout  = DataLayout::NCHW; // the permute layer is only expected for NCHW
+
+    armnn::INetworkPtr network =
+        CreateSimpleDepthwiseConv2dNetwork(inputTensorInfo, outputTensorInfo, weightsTensorInfo, descriptor);
+
+    // The runner's shape argument is the original (un-permuted) weights shape.
+    PermuteDepthwiseConv2dWeightsTestRunner(network, weightsTensorInfo.GetShape(), backendId);
+}
+#endif
+}
+
+#if defined(ARMCOMPUTECL_ENABLED)
+TEST_SUITE("Optimizer_PermuteDepthwiseConv2dWeightsGpuAcc")
+{
+TEST_CASE("PermuteDepthwiseConv2dWeightsGpuAccTest")
+{
+    PermuteDepthwiseConv2dWeightsTest(Compute::GpuAcc); // runs the shared test body on the GpuAcc backend
+}
+}
+#endif
+
+#if defined(ARMCOMPUTENEON_ENABLED)
+TEST_SUITE("Optimizer_PermuteDepthwiseConv2dWeightsCpuAcc")
+{
+TEST_CASE("PermuteDepthwiseConv2dWeightsCpuAccTest")
+{
+    PermuteDepthwiseConv2dWeightsTest(Compute::CpuAcc); // runs the shared test body on the CpuAcc backend
+}
+}
+#endif