 Android.mk                                                                 |   1 +
 CMakeLists.txt                                                             |   3 +
 src/armnn/Network.cpp                                                      |   3 +-
 src/armnn/optimizations/All.hpp                                            |   1 +
 src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp           |  87 ++++++
 src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp           |  27 ++
 src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp | 132 +++++++++
 7 files changed, 253 insertions(+), 1 deletion(-)
diff --git a/Android.mk b/Android.mk
index 3e665a039f..4c3789c9e0 100644
--- a/Android.mk
+++ b/Android.mk
@@ -91,6 +91,7 @@ LOCAL_SRC_FILES := \
src/armnn/NetworkUtils.cpp \
src/armnn/Observable.cpp \
src/armnn/Optimizer.cpp \
+ src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp \
src/armnn/ProfilingEvent.cpp \
src/armnn/Profiling.cpp \
src/armnn/Runtime.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e4d9c08d2..a2febe3066 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -420,6 +420,8 @@ list(APPEND armnn_sources
src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp
src/armnn/optimizations/OptimizeInverseConversions.hpp
src/armnn/optimizations/OptimizeInversePermutes.hpp
+ src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
+ src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp
src/armnn/optimizations/PermuteAsReshape.hpp
src/armnn/optimizations/SquashEqualSiblings.hpp
src/profiling/CommandHandlerFunctor.cpp
@@ -560,6 +562,7 @@ if(BUILD_UNIT_TESTS)
src/armnn/test/optimizations/OptimizeConsecutiveReshapesTests.cpp
src/armnn/test/optimizations/OptimizeInverseConversionsTests.cpp
src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp
+ src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
src/armnn/test/optimizations/PermuteAsReshapeTests.cpp
src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp
src/armnn/test/OptionalTest.cpp
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index a668274c4d..cf9a138084 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -818,7 +818,8 @@ IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
MovePermuteUp(),
PermuteAsReshape(),
OptimizeConsecutiveReshapes(),
- FoldPadIntoConvolution2d()));
+ FoldPadIntoConvolution2d(),
+ PermuteAndBatchToSpaceAsDepthToSpace()));
// Infer the tensor infos for all output slots. Throws an exception on failure
optGraph.InferTensorInfos();
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index 68965fd23c..4ea3f7f2d4 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -14,3 +14,4 @@
#include "ConvertFp32NetworkToFp16.hpp"
#include "AddDebug.hpp"
#include "FoldPadIntoConvolution2d.hpp"
+#include "PermuteAndBatchToSpaceAsDepthToSpace.hpp" \ No newline at end of file
diff --git a/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp
new file mode 100644
index 0000000000..c42162b6c1
--- /dev/null
+++ b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp
@@ -0,0 +1,87 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "PermuteAndBatchToSpaceAsDepthToSpace.hpp"
+
+using namespace armnn;
+using namespace armnn::optimizations;
+
+void PermuteAndBatchToSpaceAsDepthToSpaceImpl::Run(Graph& graph, InputSlot& connection) const
+{
+ // Validate base layer (the Permute) is compatible
+ Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer();
+ BOOST_ASSERT(base.GetType() == LayerType::Permute);
+ const TensorInfo& inputInfo = base.GetInputSlot(0).GetConnection()->GetTensorInfo();
+ const TensorInfo& intermediateInfo = base.GetOutputSlot(0).GetTensorInfo();
+ if (intermediateInfo.GetNumDimensions() != 4)
+ {
+ // Must be 4D, otherwise the checks below do not make sense
+ return;
+ }
+ if (!static_cast<PermuteLayer&>(base).GetParameters().m_DimMappings.IsEqual(PermutationVector{ 3, 1, 2, 0 }))
+ {
+ // Must swap batch and channels dimensions, otherwise it is not the (original) channels dimension
+ // that is being decomposed.
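+ // (e.g. an NHWC input of shape [1, 2, 3, 4] becomes [4, 2, 3, 1], as in the unit tests).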
+ return;
+ }
+
+ // Validate child layer (the BatchToSpace) is compatible
+ Layer& child = connection.GetOwningLayer();
+ BOOST_ASSERT(child.GetType() == LayerType::BatchToSpaceNd);
+ const TensorInfo& outputInfo = child.GetOutputSlot(0).GetTensorInfo();
+ const BatchToSpaceNdDescriptor& batchToSpaceDesc = static_cast<BatchToSpaceNdLayer&>(child).GetParameters();
+ if (batchToSpaceDesc.m_DataLayout != DataLayout::NHWC)
+ {
+ // The rest of this function assumes NHWC, although in future this restriction could be lifted.
+ return;
+ }
+ if (batchToSpaceDesc.m_Crops != std::vector<std::pair<unsigned int, unsigned int>>{ { 0, 0 }, { 0, 0 } })
+ {
+ // Cropping is not supported in DepthToSpace
+ return;
+ }
+ if (batchToSpaceDesc.m_BlockShape.size() != 2 ||
+ batchToSpaceDesc.m_BlockShape[0] != batchToSpaceDesc.m_BlockShape[1])
+ {
+ // Asymmetric or non-2D block sizes are not supported by DepthToSpace
+ return;
+ }
+ uint32_t blockSize = batchToSpaceDesc.m_BlockShape[0];
+ if (outputInfo.GetShape()[0] != 1 || outputInfo.GetShape()[3] != 1)
+ {
+ // The final output must have batch and channel dimensions of size 1, because the substitution leaves
+ // these two dimensions swapped; the replacement is only equivalent if swapping them makes no difference.
+ return;
+ }
+
+ // Validate the intermediate tensor quantization params.
+ // These must be identical to either the input or output quantization params, otherwise the intermediate tensor
+ // may not have sufficient range/precision to preserve the values.
+ // This would mean that once we perform the substitution this loss of precision will no longer occur,
+ // so we would have changed the meaning of the network.
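+ // For example, if the input scale were 1.0 but the intermediate scale were 0.5, values in the upper
+ // half of the input range would saturate in the intermediate tensor; removing that tensor would remove
+ // the saturation and so change the results.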
+ bool isIntermediateQuantParamsSameAsInput =
+ intermediateInfo.GetQuantizationScale() == inputInfo.GetQuantizationScale() &&
+ intermediateInfo.GetQuantizationOffset() == inputInfo.GetQuantizationOffset();
+ bool isIntermediateQuantParamsSameAsOutput =
+ intermediateInfo.GetQuantizationScale() == outputInfo.GetQuantizationScale() &&
+ intermediateInfo.GetQuantizationOffset() == outputInfo.GetQuantizationOffset();
+ if (!isIntermediateQuantParamsSameAsInput && !isIntermediateQuantParamsSameAsOutput)
+ {
+ return;
+ }
+
+ // Insert equivalent DepthToSpace layer
+ const std::string name = std::string("merged-") + base.GetName() + std::string("-with-") + child.GetName();
+
+ // Inserts the equivalent DepthToSpace layer before the base (Permute) layer.
+ const DepthToSpaceDescriptor depthToSpaceDesc(blockSize, DataLayout::NHWC);
+ auto& depthToSpace = *graph.InsertNewLayer<DepthToSpaceLayer>(base.GetInputSlot(0), depthToSpaceDesc, name.c_str());
+ depthToSpace.GetOutputHandler().SetTensorInfo(outputInfo);
+
+ // Moves connections from child output to new layer.
+ // Child layer will be removed as it's left unconnected.
+ // Base layer will be removed if left unconnected.
+ child.GetOutputSlot().MoveAllConnections(depthToSpace.GetOutputSlot());
+}
diff --git a/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
new file mode 100644
index 0000000000..4a73efca40
--- /dev/null
+++ b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "Optimization.hpp"
+
+namespace armnn
+{
+namespace optimizations
+{
+
+/// Replaces Permute leading into BatchToSpace with a DepthToSpace
+/// in the case where the Permute swaps the batch and channels dimensions
+/// such that the replacement is valid.
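+///
+/// For example (the case covered by the unit tests), an NHWC tensor of shape [1, 2, 3, 4]
+/// permuted with { 3, 1, 2, 0 } gives [4, 2, 3, 1], and a BatchToSpaceNd with block shape
+/// { 2, 2 } then gives [1, 4, 6, 1] - the same result as a single DepthToSpace with block
+/// size 2 applied to the original [1, 2, 3, 4] tensor.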
+class PermuteAndBatchToSpaceAsDepthToSpaceImpl
+{
+public:
+ void Run(Graph& graph, InputSlot& connection) const;
+};
+
+using PermuteAndBatchToSpaceAsDepthToSpace =
+ OptimizeForConnection<PermuteLayer, BatchToSpaceNdLayer, PermuteAndBatchToSpaceAsDepthToSpaceImpl>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp b/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
new file mode 100644
index 0000000000..ec1dd511c9
--- /dev/null
+++ b/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
@@ -0,0 +1,132 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../TestUtils.hpp"
+
+#include <Network.hpp>
+#include <Optimizer.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(Optimizer)
+using namespace armnn::optimizations;
+
+namespace
+{
+
+/// Shared helper for the tests below, so that both cases test the same network.
+INetworkPtr CreateTestNetwork()
+{
+ // Create a network
+ INetworkPtr network = INetwork::Create();
+
+ auto input = network->AddInputLayer(0, "input");
+ const TensorInfo inputInfo({ 1, 2, 3, 4 }, DataType::Float32);
+ input->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ // Insert Permute which swaps batches and channels dimensions
+ auto permute = network->AddPermuteLayer(PermuteDescriptor(PermutationVector{ 3, 1, 2, 0 }), "permute");
+ const TensorInfo permuteInfo({ 4, 2, 3, 1 }, DataType::Float32);
+ permute->GetOutputSlot(0).SetTensorInfo(permuteInfo);
+ input->GetOutputSlot(0).Connect(permute->GetInputSlot(0));
+
+ // Insert BatchToSpace
+ BatchToSpaceNdDescriptor batchToSpaceDesc;
+ batchToSpaceDesc.m_BlockShape = { 2, 2 };
+ batchToSpaceDesc.m_DataLayout = DataLayout::NHWC;
+ auto batchToSpace = network->AddBatchToSpaceNdLayer(batchToSpaceDesc, "batchToSpace");
+ const TensorInfo batchToSpaceInfo({ 1, 4, 6, 1 }, DataType::Float32);
+ batchToSpace->GetOutputSlot(0).SetTensorInfo(batchToSpaceInfo);
+ permute->GetOutputSlot(0).Connect(batchToSpace->GetInputSlot(0));
+
+ auto output = network->AddOutputLayer(0, "output");
+ batchToSpace->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ return network;
+}
+
+} // namespace
+
+/// Tests that PermuteAndBatchToSpaceAsDepthToSpace performs the expected graph substitution.
+/// Note this does not check the numerical correctness of the optimization - that is done in the test below.
+BOOST_AUTO_TEST_CASE(PermuteAndBatchToSpaceAsDepthToSpaceOptimizerTest)
+{
+ INetworkPtr network = CreateTestNetwork();
+ Graph graph = static_cast<Network*>(network.get())->GetGraph();
+
+ // Confirm initial graph is as we expect
+ BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<InputLayer>, &IsLayerOfType<PermuteLayer>,
+ &IsLayerOfType<BatchToSpaceNdLayer>, &IsLayerOfType<OutputLayer>));
+
+ // Perform the optimization which should merge the two layers into a DepthToSpace
+ armnn::Optimizer::Pass(graph, MakeOptimizations(PermuteAndBatchToSpaceAsDepthToSpace()));
+
+ // Check that the replacement has been made as expected
+ auto checkDepthToSpace = [](const Layer* const layer) -> bool {
+ return IsLayerOfType<DepthToSpaceLayer>(layer) &&
+ static_cast<const DepthToSpaceLayer*>(layer)->GetParameters().m_BlockSize == 2 &&
+ static_cast<const DepthToSpaceLayer*>(layer)->GetParameters().m_DataLayout == DataLayout::NHWC &&
+ layer->GetOutputHandler().GetTensorInfo() == TensorInfo({ 1, 4, 6, 1 }, DataType::Float32);
+ };
+
+ BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<InputLayer>, checkDepthToSpace,
+ &IsLayerOfType<OutputLayer>));
+
+ // Check the new layer has the two merged layers listed as related layers
+ std::list<std::string> testRelatedLayers = { "batchToSpace", "permute" };
+ BOOST_TEST(CheckRelatedLayers<DepthToSpaceLayer>(graph, testRelatedLayers));
+}
+
+/// Tests that the optimization performed by PermuteAndBatchToSpaceAsDepthToSpace does not change the behaviour
+/// of the network (i.e. it still produces the correct output).
+BOOST_AUTO_TEST_CASE(PermuteAndBatchToSpaceAsDepthToSpaceCorrectnessTest)
+{
+ INetworkPtr network = CreateTestNetwork();
+
+ IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());
+
+ IOptimizedNetworkPtr optimizedNetwork = Optimize(*network, { Compute::CpuRef }, runtime->GetDeviceSpec());
+
+ // Confirm that the optimization has actually taken place
+ const Graph& optGraph = static_cast<OptimizedNetwork*>(optimizedNetwork.get())->GetGraph();
+ BOOST_TEST(CheckSequence(optGraph.cbegin(), optGraph.cend(), &IsLayerOfType<InputLayer>,
+ &IsLayerOfType<DepthToSpaceLayer>, &IsLayerOfType<OutputLayer>));
+
+ // Load the graph into a runtime so we can check it produces the correct output
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optimizedNetwork));
+
+ std::vector<float> inputData{
+ // Each row here is a row of pixels where each pixel has 4 channels
+ // clang-format off
+ 1.0f, 2.0f, 3.0f, 4.0f, 10.0f, 20.0f, 30.0f, 40.0f, 100.0f, 200.0f, 300.0f, 400.0f,
+ -1.0f, -2.0f, -3.0f, -4.0f, -10.0f, -20.0f, -30.0f, -40.0f, -100.0f, -200.0f, -300.0f, -400.0f,
+ // clang-format on
+ };
+ ConstTensor input(TensorInfo({ 1, 2, 3, 4 }, DataType::Float32), inputData);
+ InputTensors inputs = { { 0, input } };
+ std::vector<float> outputData(4 * 6);
+ Tensor output(TensorInfo({ 1, 4, 6, 1 }, DataType::Float32), outputData.data());
+ OutputTensors outputs = { { 0, output } };
+ runtime->EnqueueWorkload(netId, inputs, outputs);
+
+ // Check the output is as expected.
+ // Note this output has been generated by running the network *without* the optimization.
+ std::vector<float> expectedOutput = {
+ // Rows and columns here match exactly with the tensor, as there is only 1 channel.
+ // clang-format off
+ 1.0f, 2.0f, 10.0f, 20.0f, 100.0f, 200.0f,
+ 3.0f, 4.0f, 30.0f, 40.0f, 300.0f, 400.0f,
+
+ -1.0f, -2.0f, -10.0f, -20.0f, -100.0f, -200.0f,
+ -3.0f, -4.0f, -30.0f, -40.0f, -300.0f, -400.0f,
+ // clang-format on
+ };
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file