From 5fc0fd6661f9647092deb052d052973a237bd52d Mon Sep 17 00:00:00 2001 From: Matthew Sloyan Date: Mon, 3 May 2021 12:22:03 +0100 Subject: MLCE-418 Reduce layer does not support multiple axes * Added backend specific optimization to chain new reduces layers for each axis to simulate behaviour of a layer with multiple axes. * Added function to calculate reduced output shape. * Added unit tests. * Includes rework to fix IVGCVSW-5987. Signed-off-by: Matthew Sloyan Change-Id: I154b3698b5e6756b05b2a0b5a3f0896184efce72 --- Android.mk | 1 + CMakeLists.txt | 1 + .../test/optimizations/ReduceMultipleAxesTests.cpp | 293 +++++++++++++++++++++ src/backends/aclCommon/ArmComputeSubgraphUtils.hpp | 84 ++++++ src/backends/aclCommon/ArmComputeUtils.hpp | 94 +++++++ src/backends/cl/ClBackend.cpp | 21 ++ src/backends/cl/workloads/ClReduceWorkload.cpp | 34 +-- src/backends/neon/NeonBackend.cpp | 21 ++ src/backends/neon/workloads/NeonReduceWorkload.cpp | 34 ++- 9 files changed, 554 insertions(+), 29 deletions(-) create mode 100644 src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp diff --git a/Android.mk b/Android.mk index d9230e5585..168b32a400 100644 --- a/Android.mk +++ b/Android.mk @@ -393,6 +393,7 @@ LOCAL_SRC_FILES := \ src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp \ src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp \ src/armnn/test/optimizations/PermuteAsReshapeTests.cpp \ + src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp \ src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp \ src/armnn/test/optimizations/TransposeAsReshapeTests.cpp \ src/armnn/test/OptimizerTests.cpp \ diff --git a/CMakeLists.txt b/CMakeLists.txt index dfdff89bbe..1e201ea0ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -566,6 +566,7 @@ if(BUILD_UNIT_TESTS) src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp src/armnn/test/optimizations/PermuteAsReshapeTests.cpp 
+ src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp src/armnn/test/optimizations/TransposeAsReshapeTests.cpp src/armnn/test/OptionalTest.cpp diff --git a/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp b/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp new file mode 100644 index 0000000000..b42c0a2cfb --- /dev/null +++ b/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp @@ -0,0 +1,293 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "../GraphUtils.hpp" +#include "../TestUtils.hpp" + +#include + +#include + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Optimizer) + +INetworkPtr CreateSimpleReduceNetwork(ReduceDescriptor reduceDescriptor, + TensorShape& inputShape, + TensorShape& outputShape) +{ + // Create a network + INetworkPtr network = INetwork::Create(); + + const std::string layerName("reduce_layer"); + const TensorInfo inputInfo (inputShape, DataType::Float32); + const TensorInfo outputInfo(outputShape, DataType::Float32); + + IConnectableLayer* const inputLayer = network->AddInputLayer(0); + IConnectableLayer* const reduceLayer = network->AddReduceLayer(reduceDescriptor, layerName.c_str()); + IConnectableLayer* const outputLayer1 = network->AddOutputLayer(0); + IConnectableLayer* const outputLayer2 = network->AddOutputLayer(1); + + inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); + reduceLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); + + inputLayer->GetOutputSlot(0).Connect(reduceLayer->GetInputSlot(0)); + reduceLayer->GetOutputSlot(0).Connect(outputLayer1->GetInputSlot(0)); + reduceLayer->GetOutputSlot(0).Connect(outputLayer2->GetInputSlot(0)); + + return network; +} + +void ReduceWithMultipleAxesTest(INetworkPtr& network, + const TensorShape& outputShape, + const std::vector& inputData, + const std::vector& expectedOutput, + const size_t numOfAxes, + Compute backendId) +{ + // 
Create ArmNN runtime + IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions()); + + // Optimise ArmNN network + IOptimizedNetworkPtr optNet = Optimize(*network, {backendId}, run->GetDeviceSpec()); + + Graph& graph = GetGraphForTesting(optNet.get()); + if (numOfAxes == 2) + { + BOOST_CHECK(graph.GetNumLayers() == 5); + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); + } + else + { + BOOST_CHECK(graph.GetNumLayers() == 6); + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); + } + + // Get last layer in new chain, layers name follow 0, 1, 2 pattern + std::string layerName = "reduce_layer_" + std::to_string(numOfAxes - 1); + Layer* const reduceLayer = GetFirstLayerWithName(graph, layerName); + BOOST_TEST(reduceLayer); + auto reduceTensorInfo = reduceLayer->GetOutputSlot().GetTensorInfo(); + + // Tensorshape and the data type are correct + BOOST_TEST((reduceTensorInfo.GetShape() == outputShape)); + BOOST_TEST((reduceTensorInfo.GetDataType() == DataType::Float32)); + + // Load network into runtime + NetworkId networkIdentifier; + run->LoadNetwork(networkIdentifier, std::move(optNet)); + + // Create input and output tensors + std::vector outputData(expectedOutput.size()); + InputTensors inputTensors + { + { 0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputData.data()) } + }; + OutputTensors outputTensors + { + { 0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData.data()) }, + { 1, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 1), outputData.data()) } + }; + + // Run inference + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + + // Checks the results + BOOST_TEST(outputData == expectedOutput); +} + +void ReduceSumWithTwoAxesKeepDimsTest(Compute backendId) +{ + 
armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 1, 2 }; + reduceDescriptor.m_KeepDims = true; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 1, 3, 2, 4 }; + TensorShape outputShape = { 1, 1, 1, 4 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. + const std::vector inputData({ 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + + 10.0f, 20.0f, 30.0f, 40.0f, + 50.0f, 60.0f, 70.0f, 80.0f, + + 100.0f, 200.0f, 300.0f, 400.0f, + 500.0f, 600.0f, 700.0f, 800.0f }); + const std::vector expectedOutput({ 666.0f, 888.0f, 1110.0f, 1332.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +void ReduceSumWithTwoAxesTest(Compute backendId) +{ + armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 1, 2 }; + reduceDescriptor.m_KeepDims = false; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 1, 3, 2, 4 }; + TensorShape outputShape = { 1, 4 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. 
+ const std::vector inputData({ 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + + 10.0f, 20.0f, 30.0f, 40.0f, + 50.0f, 60.0f, 70.0f, 80.0f, + + 100.0f, 200.0f, 300.0f, 400.0f, + 500.0f, 600.0f, 700.0f, 800.0f }); + const std::vector expectedOutput({ 666.0f, 888.0f, 1110.0f, 1332.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +void ReduceSumWithThreeAxesKeepDimsTest(Compute backendId) +{ + armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 0, 2, 3 }; + reduceDescriptor.m_KeepDims = true; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 2, 2, 2, 2 }; + TensorShape outputShape = { 1, 2, 1, 1 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. + const std::vector inputData({ 1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 10.0f, 20.0f, + 30.0f, 40.0f, + + 50.0f, 60.0f, + 70.0f, 80.0f }); + const std::vector expectedOutput({ 110.0f, 286.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +void ReduceSumWithThreeAxesTest(Compute backendId) +{ + armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 0, 2, 3 }; + reduceDescriptor.m_KeepDims = false; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 2, 2, 2, 2 }; + TensorShape outputShape = { 2 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. 
+ const std::vector inputData({ 1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 10.0f, 20.0f, + 30.0f, 40.0f, + + 50.0f, 60.0f, + 70.0f, 80.0f }); + const std::vector expectedOutput({ 110.0f, 286.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +using namespace armnn; +#if defined(ARMCOMPUTENEON_ENABLED) +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesKeepDimsCpuAccTest) +{ + ReduceSumWithTwoAxesKeepDimsTest(Compute::CpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesCpuAccTest) +{ + ReduceSumWithTwoAxesTest(Compute::CpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesKeepDimsCpuAccTest) +{ + ReduceSumWithThreeAxesKeepDimsTest(Compute::CpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesCpuAccTest) +{ + ReduceSumWithThreeAxesTest(Compute::CpuAcc); +} +#endif + +#if defined(ARMCOMPUTECL_ENABLED) +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesKeepDimsGpuAccTest) +{ + ReduceSumWithTwoAxesKeepDimsTest(Compute::GpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesGpuAccTest) +{ + ReduceSumWithTwoAxesTest(Compute::GpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesKeepDimsGpuAccTest) +{ + ReduceSumWithThreeAxesKeepDimsTest(Compute::GpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesGpuAccTest) +{ + ReduceSumWithThreeAxesTest(Compute::GpuAcc); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp index a0fca46330..521c17cd62 100644 --- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp +++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp @@ -6,6 +6,9 @@ #pragma once #include +#include + +#include namespace armnn { @@ -147,4 +150,85 @@ LayerType* FuseLayerWithWeightsAndBiases(OptimizationViews& optimizationViews, return replacementLayer; } +// +// If reduce layer has multiple axes, add new layer for each 
axis to simulate the same behaviour +// as currently only one axis is supported. +// +template +std::vector ChainReduceLayers(OptimizationViews& optimizationViews, + LayerType* baseLayer, + ReduceDescriptor& desc) +{ + // Vector of new chained layers, used for substitution. + std::vector layers; + + // Vector of axes so each layer is reshaped correctly. + std::vector axes; + unsigned int recalulatedAxis = 0; + + for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i) + { + // Get TensorInfo from base layer and reduce shape using axis. + TensorInfo layerInfo = baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + + axes.emplace_back(desc.m_vAxis[i]); + + const TensorInfo& reducedTensorInfo = ComputeReductionTensorShape(layerInfo, + axes, + desc.m_KeepDims); + + // Create a vector for the single axis to be assigned to the descriptor. + // Update axis if keepDims is set reduce layers correctly. + std::vector singleAxis(1, desc.m_vAxis[i] - recalulatedAxis); + + // Create a descriptor and assign single axis. + ReduceDescriptor newReduceDescriptor = baseLayer->GetParameters(); + newReduceDescriptor.m_vAxis.assign(singleAxis.begin(), singleAxis.end()); + + // Add new layer to graph. + std::string layerName = "reduce_layer_" + std::to_string(i); + Layer* replacementLayer = optimizationViews.GetGraph().AddLayer(newReduceDescriptor, + layerName.c_str()); + // Connect previous layer with new layer. + // The first and last layer will be connected when the subgraph is replaced. + if (!layers.empty()) + { + layers[i - 1]->GetOutputSlot(0).Connect(replacementLayer->GetInputSlot(0)); + } + + // Set updated tensorInfo for new layer. + replacementLayer->GetOutputSlot(0).SetTensorInfo(reducedTensorInfo); + + if (!desc.m_KeepDims) + { + recalulatedAxis++; + } + + layers.emplace_back(replacementLayer); + } + + // Check if the TensorInfo from the last layer equals the inferred output from the original layer. 
+ ARMNN_ASSERT(baseLayer->GetOutputSlot(0).GetTensorInfo() == layers.back()->GetOutputSlot().GetTensorInfo()); + + return layers; +} + +// +// Substitute baseLayer with new subgraph +// +template +void ReplaceLayers(OptimizationViews& optimizationViews, + LayerType* baseLayer, + std::vector& layers) +{ + std::list replacementLayers(layers.begin(), layers.end()); + + SubgraphView substitutionSubgraph(baseLayer); + SubgraphView replacementSubgraph(CreateInputsFrom({replacementLayers.front()}), + CreateOutputsFrom({replacementLayers.back()}), + std::move(replacementLayers)); + + optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph}); +} + } // namespace armnn diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp index d9efab288f..624ce5df7a 100644 --- a/src/backends/aclCommon/ArmComputeUtils.hpp +++ b/src/backends/aclCommon/ArmComputeUtils.hpp @@ -7,10 +7,19 @@ #include #include #include +#include #include #include +#if defined(ARMCOMPUTENEON_ENABLED) +#include "neon/workloads/NeonReduceWorkload.hpp" +#endif + +#if defined(ARMCOMPUTECL_ENABLED) +#include "cl/workloads/ClReduceWorkload.hpp" +#endif + namespace armnn { @@ -267,4 +276,89 @@ inline arm_compute::ReductionOperation ConvertReductionOperationToAcl(const Redu } } +/// Function to compute the output tensor shape based on the axes and if keepDims is set. 
+inline const TensorInfo ComputeReductionTensorShape(const armnn::TensorInfo& input, + const std::vector& vAxis, + const bool keepDims) +{ + auto reducedTensorInfo = input; + unsigned int rank = reducedTensorInfo.GetNumDimensions(); + unsigned int outputRank = 0; + // Calculate output dimension + if (keepDims) + { + outputRank = rank; + } + else if (vAxis.empty()) + { + outputRank = 1; + } + else if (vAxis.size() > reducedTensorInfo.GetNumDimensions()) + { + throw LayerValidationException("ReduceLayer: Dimensions to reduce can not be bigger than input dimensions"); + } + else + { + outputRank = reducedTensorInfo.GetNumDimensions() - armnn::numeric_cast(vAxis.size()); + if (outputRank == 0) + { + outputRank = 1; + } + } + std::vector dimSizes(outputRank, 1); + if (!vAxis.empty()) + { + // Skip the dimension that has been reduced unless keepDims is true. + unsigned int outputIndex = 0; + for (unsigned int i = 0; i < reducedTensorInfo.GetNumDimensions(); ++i) + { + if (std::find(vAxis.begin(), vAxis.end(), i) == vAxis.end()) + { + dimSizes[outputIndex] = armnn::numeric_cast(reducedTensorInfo.GetShape()[i]); + ++outputIndex; + } + else if (keepDims) + { + dimSizes[outputIndex] = 1; + ++outputIndex; + } + } + } + const TensorShape inferredShape = TensorShape(outputRank, dimSizes.data()); + reducedTensorInfo.SetShape(inferredShape); + return reducedTensorInfo; +} + +/// Macro function check if layer with multiple axes is supported on each backend +#define IS_MULTI_AXES_REDUCE_SUPPORTED(func, input, desc, status) \ + armnn::TensorInfo inputTensorInfo = input; \ + unsigned int recalulatedAxis = 0; \ + std::vector axes; \ + \ + for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i) \ + { \ + axes.emplace_back(desc.m_vAxis[i]); \ + \ + const armnn::TensorInfo& reducedTensorInfo = \ + ComputeReductionTensorShape(input, axes, desc.m_KeepDims); \ + \ + std::vector singleAxis(1, desc.m_vAxis[i] - recalulatedAxis); \ + \ + armnn::ReduceDescriptor newReduceDescriptor = desc; \ + 
newReduceDescriptor.m_vAxis.assign(singleAxis.begin(), singleAxis.end()); \ + \ + status = func(inputTensorInfo, reducedTensorInfo, newReduceDescriptor); \ + if (!status) \ + { \ + break; \ + } \ + \ + if (!desc.m_KeepDims) \ + { \ + recalulatedAxis++; \ + } \ + \ + inputTensorInfo = reducedTensorInfo; \ + } + } // namespace armnn diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index 35770d9219..a9ab237325 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -30,6 +30,7 @@ #include "workloads/ClDivisionWorkload.hpp" #include "workloads/ClFullyConnectedWorkload.hpp" #include "workloads/ClMultiplicationWorkload.hpp" +#include "workloads/ClReduceWorkload.hpp" #include "workloads/ClSubtractionWorkload.hpp" #include @@ -220,6 +221,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph, --it; Layer& base = **it; + // Fuse activation into previous layer if supported by backend if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication @@ -451,6 +453,25 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph, } } } + + // Separate reduce layer with multiple axes into multiple reduce layers with 1 axis. + if (base.GetType() == LayerType::Reduce) + { + ReduceLayer* baseLayer = PolymorphicDowncast(&base); + ReduceDescriptor reduceDescriptor = baseLayer->GetParameters(); + + if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1) + { + // Add new layers to the graph and connect them. + std::vector layers = ChainReduceLayers(optimizationViews, + baseLayer, + reduceDescriptor); + + // Replace existing baselayer with new subgraph. 
+ ReplaceLayers(optimizationViews, baseLayer, layers); + untouched.erase(baseLayer->GetGuid()); + } + } } if (optimizationViews.GetSubstitutions().empty()) diff --git a/src/backends/cl/workloads/ClReduceWorkload.cpp b/src/backends/cl/workloads/ClReduceWorkload.cpp index 6f594ff7a9..18415c4cba 100644 --- a/src/backends/cl/workloads/ClReduceWorkload.cpp +++ b/src/backends/cl/workloads/ClReduceWorkload.cpp @@ -19,24 +19,28 @@ arm_compute::Status ClReduceWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const ReduceDescriptor& desc) { - const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); - const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); - if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1) + if ( desc.m_vAxis.size()==1 || desc.m_vAxis.empty()) { - return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, - "ClReduceWorkload: Reduction is supported only on 1 axis."); - } - - arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), - input.GetNumDimensions(), - desc.m_vAxis); + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), + input.GetNumDimensions(), + desc.m_vAxis); - return arm_compute::CLReductionOperation::validate(&aclInputInfo, - &aclOutputInfo, - static_cast(coords[0]), - ConvertReductionOperationToAcl(desc), - desc.m_KeepDims); + return arm_compute::CLReductionOperation::validate(&aclInputInfo, + &aclOutputInfo, + static_cast(coords[0]), + ConvertReductionOperationToAcl(desc), + desc.m_KeepDims); + } + else + { + // Validate layer if there are multiple axes. 
+ arm_compute::Status status; + IS_MULTI_AXES_REDUCE_SUPPORTED(ClReduceWorkloadValidate, input, desc, status); + return status; + } } ClReduceWorkload::ClReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info) diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp index a1299fb458..b496238cf3 100644 --- a/src/backends/neon/NeonBackend.cpp +++ b/src/backends/neon/NeonBackend.cpp @@ -29,6 +29,7 @@ #include "workloads/NeonDivisionWorkload.hpp" #include "workloads/NeonFullyConnectedWorkload.hpp" #include "workloads/NeonMultiplicationWorkload.hpp" +#include "workloads/NeonReduceWorkload.hpp" #include "workloads/NeonSubtractionWorkload.hpp" #include @@ -161,6 +162,7 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph --it; Layer& base = **it; + // Fuse activation into previous layer if supported by backend if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication @@ -393,6 +395,25 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph } } } + + // Separate reduce layer with multiple axes into multiple reduce layers with 1 axis. + if (base.GetType() == LayerType::Reduce) + { + ReduceLayer* baseLayer = PolymorphicDowncast(&base); + ReduceDescriptor reduceDescriptor = baseLayer->GetParameters(); + + if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1) + { + // Add new layers to the graph and connect them. + std::vector layers = ChainReduceLayers(optimizationViews, + baseLayer, + reduceDescriptor); + + // Replace existing baselayer with new subgraph. 
+ ReplaceLayers(optimizationViews, baseLayer, layers); + untouched.erase(baseLayer->GetGuid()); + } + } } if (optimizationViews.GetSubstitutions().empty()) diff --git a/src/backends/neon/workloads/NeonReduceWorkload.cpp b/src/backends/neon/workloads/NeonReduceWorkload.cpp index 0e1b46a3a1..1436cd1192 100644 --- a/src/backends/neon/workloads/NeonReduceWorkload.cpp +++ b/src/backends/neon/workloads/NeonReduceWorkload.cpp @@ -20,23 +20,28 @@ arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const ReduceDescriptor& desc) { - const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); - const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); - if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1) + if ( desc.m_vAxis.size()==1 || desc.m_vAxis.empty()) { - return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, - "NeonReduceWorkload: Reduction is supported only on 1 axis."); - } + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); - arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), - input.GetNumDimensions(), - desc.m_vAxis); + arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), + input.GetNumDimensions(), + desc.m_vAxis); - return arm_compute::NEReductionOperation::validate(&aclInputInfo, - &aclOutputInfo, - static_cast(coords[0]), - ConvertReductionOperationToAcl(desc), - desc.m_KeepDims); + return arm_compute::NEReductionOperation::validate(&aclInputInfo, + &aclOutputInfo, + static_cast(coords[0]), + ConvertReductionOperationToAcl(desc), + desc.m_KeepDims); + } + else + { + // Validate layer if there are multiple axes. 
+ arm_compute::Status status; + IS_MULTI_AXES_REDUCE_SUPPORTED(NeonReduceWorkloadValidate, input, desc, status); + return status; + } } NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info) @@ -50,6 +55,7 @@ NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(input.info()->num_dimensions(), info.m_InputTensorInfos[0].GetNumDimensions(), m_Data.m_Parameters.m_vAxis); + m_Layer.configure(&input, &output, static_cast(coords[0]), -- cgit v1.2.1