aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sloyan <matthew.sloyan@arm.com>2021-05-03 12:22:03 +0100
committerMatthew Sloyan <matthew.sloyan@arm.com>2021-05-06 17:58:26 +0000
commitd905decd256558bbee165e636ce4242ac3b9c917 (patch)
tree86f51622399553d1741b66ff232a429de8fc43f8
parent1f58f03d82c482626b1b4673b6c0e25da4338fb5 (diff)
downloadarmnn-d905decd256558bbee165e636ce4242ac3b9c917.tar.gz
MLCE-418 Reduce layer does not support multiple axes
* Added backend specific optimization to chain new reduces layers for each axis to simulate behaviour of a layer with multiple axes. * Added function to calculate reduced output shape. * Added unit tests. Signed-off-by: Matthew Sloyan <matthew.sloyan@arm.com> Change-Id: I180b0b111b7bcf3d0c283f1db0b82d5f17757682
-rw-r--r--Android.mk1
-rw-r--r--CMakeLists.txt1
-rw-r--r--src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp288
-rw-r--r--src/backends/aclCommon/ArmComputeSubgraphUtils.hpp85
-rw-r--r--src/backends/aclCommon/ArmComputeUtils.hpp55
-rw-r--r--src/backends/cl/ClBackend.cpp24
-rw-r--r--src/backends/cl/workloads/ClReduceWorkload.cpp51
-rw-r--r--src/backends/neon/NeonBackend.cpp24
-rw-r--r--src/backends/neon/workloads/NeonReduceWorkload.cpp53
9 files changed, 558 insertions, 24 deletions
diff --git a/Android.mk b/Android.mk
index d9230e5585..168b32a400 100644
--- a/Android.mk
+++ b/Android.mk
@@ -393,6 +393,7 @@ LOCAL_SRC_FILES := \
src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp \
src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp \
src/armnn/test/optimizations/PermuteAsReshapeTests.cpp \
+ src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp \
src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp \
src/armnn/test/optimizations/TransposeAsReshapeTests.cpp \
src/armnn/test/OptimizerTests.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86dad31a21..c85ec2caf7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -563,6 +563,7 @@ if(BUILD_UNIT_TESTS)
src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp
src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
src/armnn/test/optimizations/PermuteAsReshapeTests.cpp
+ src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp
src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp
src/armnn/test/optimizations/TransposeAsReshapeTests.cpp
src/armnn/test/OptionalTest.cpp
diff --git a/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp b/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp
new file mode 100644
index 0000000000..28c0ea1466
--- /dev/null
+++ b/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp
@@ -0,0 +1,288 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../GraphUtils.hpp"
+#include "../TestUtils.hpp"
+
+#include <armnn/INetwork.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(Optimizer)
+
+INetworkPtr CreateSimpleReduceNetwork(ReduceDescriptor reduceDescriptor,
+ TensorShape& inputShape,
+ TensorShape& outputShape)
+{
+ // Create a network
+ INetworkPtr network = INetwork::Create();
+
+ const std::string layerName("reduce_layer");
+ const TensorInfo inputInfo (inputShape, DataType::Float32);
+ const TensorInfo outputInfo(outputShape, DataType::Float32);
+
+ IConnectableLayer* const inputLayer = network->AddInputLayer(0);
+ IConnectableLayer* const reduceLayer = network->AddReduceLayer(reduceDescriptor, layerName.c_str());
+ IConnectableLayer* const outputLayer = network->AddOutputLayer(0);
+
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+ reduceLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+ inputLayer->GetOutputSlot(0).Connect(reduceLayer->GetInputSlot(0));
+ reduceLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+
+ return network;
+}
+
+void ReduceWithMultipleAxesTest(INetworkPtr& network,
+ const TensorShape& outputShape,
+ const std::vector<float>& inputData,
+ const std::vector<float>& expectedOutput,
+ const size_t numOfAxes,
+ Compute backendId)
+{
+ // Create ArmNN runtime
+ IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions());
+
+ // Optimise ArmNN network
+ IOptimizedNetworkPtr optNet = Optimize(*network, {backendId}, run->GetDeviceSpec());
+
+ Graph& graph = GetGraphForTesting(optNet.get());
+ if (numOfAxes == 2)
+ {
+ BOOST_CHECK(graph.GetNumLayers() == 4);
+ BOOST_TEST(CheckSequence(graph.cbegin(),
+ graph.cend(),
+ &IsLayerOfType<InputLayer>,
+ &IsLayerOfType<ReduceLayer>,
+ &IsLayerOfType<ReduceLayer>,
+ &IsLayerOfType<OutputLayer>));
+ }
+ else
+ {
+ BOOST_CHECK(graph.GetNumLayers() == 5);
+ BOOST_TEST(CheckSequence(graph.cbegin(),
+ graph.cend(),
+ &IsLayerOfType<InputLayer>,
+ &IsLayerOfType<ReduceLayer>,
+ &IsLayerOfType<ReduceLayer>,
+ &IsLayerOfType<ReduceLayer>,
+ &IsLayerOfType<OutputLayer>));
+ }
+
+ // Get last layer in new chain, layers name follow 0, 1, 2 pattern
+ std::string layerName = "reduce_layer_" + std::to_string(numOfAxes - 1);
+ Layer* const reduceLayer = GetFirstLayerWithName(graph, layerName);
+ BOOST_TEST(reduceLayer);
+ auto reduceTensorInfo = reduceLayer->GetOutputSlot().GetTensorInfo();
+
+ // Tensorshape and the data type are correct
+ BOOST_TEST((reduceTensorInfo.GetShape() == outputShape));
+ BOOST_TEST((reduceTensorInfo.GetDataType() == DataType::Float32));
+
+ // Load network into runtime
+ NetworkId networkIdentifier;
+ run->LoadNetwork(networkIdentifier, std::move(optNet));
+
+ // Create input and output tensors
+ std::vector<float> outputData(expectedOutput.size());
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputData.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData.data()) }
+ };
+
+ // Run inference
+ run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);
+
+ // Checks the results
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+void ReduceSumWithTwoAxesKeepDimsTest(Compute backendId)
+{
+ armnn::ReduceDescriptor reduceDescriptor;
+ reduceDescriptor.m_vAxis = { 1, 2 };
+ reduceDescriptor.m_KeepDims = true;
+ reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum;
+
+ TensorShape inputShape = { 1, 3, 2, 4 };
+ TensorShape outputShape = { 1, 1, 1, 4 };
+
+ // Construct ArmNN network
+ INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape);
+
+ // Creates structures for input & output.
+ const std::vector<float> inputData({ 1.0f, 2.0f, 3.0f, 4.0f,
+ 5.0f, 6.0f, 7.0f, 8.0f,
+
+ 10.0f, 20.0f, 30.0f, 40.0f,
+ 50.0f, 60.0f, 70.0f, 80.0f,
+
+ 100.0f, 200.0f, 300.0f, 400.0f,
+ 500.0f, 600.0f, 700.0f, 800.0f });
+ const std::vector<float> expectedOutput({ 666.0f, 888.0f, 1110.0f, 1332.0f });
+
+ ReduceWithMultipleAxesTest(network,
+ outputShape,
+ inputData,
+ expectedOutput,
+ reduceDescriptor.m_vAxis.size(),
+ backendId);
+}
+
+void ReduceSumWithTwoAxesTest(Compute backendId)
+{
+ armnn::ReduceDescriptor reduceDescriptor;
+ reduceDescriptor.m_vAxis = { 1, 2 };
+ reduceDescriptor.m_KeepDims = false;
+ reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum;
+
+ TensorShape inputShape = { 1, 3, 2, 4 };
+ TensorShape outputShape = { 1, 4 };
+
+ // Construct ArmNN network
+ INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape);
+
+ // Creates structures for input & output.
+ const std::vector<float> inputData({ 1.0f, 2.0f, 3.0f, 4.0f,
+ 5.0f, 6.0f, 7.0f, 8.0f,
+
+ 10.0f, 20.0f, 30.0f, 40.0f,
+ 50.0f, 60.0f, 70.0f, 80.0f,
+
+ 100.0f, 200.0f, 300.0f, 400.0f,
+ 500.0f, 600.0f, 700.0f, 800.0f });
+ const std::vector<float> expectedOutput({ 666.0f, 888.0f, 1110.0f, 1332.0f });
+
+ ReduceWithMultipleAxesTest(network,
+ outputShape,
+ inputData,
+ expectedOutput,
+ reduceDescriptor.m_vAxis.size(),
+ backendId);
+}
+
+void ReduceSumWithThreeAxesKeepDimsTest(Compute backendId)
+{
+ armnn::ReduceDescriptor reduceDescriptor;
+ reduceDescriptor.m_vAxis = { 0, 2, 3 };
+ reduceDescriptor.m_KeepDims = true;
+ reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum;
+
+ TensorShape inputShape = { 2, 2, 2, 2 };
+ TensorShape outputShape = { 1, 2, 1, 1 };
+
+ // Construct ArmNN network
+ INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape);
+
+ // Creates structures for input & output.
+ const std::vector<float> inputData({ 1.0f, 2.0f,
+ 3.0f, 4.0f,
+
+ 5.0f, 6.0f,
+ 7.0f, 8.0f,
+
+ 10.0f, 20.0f,
+ 30.0f, 40.0f,
+
+ 50.0f, 60.0f,
+ 70.0f, 80.0f });
+ const std::vector<float> expectedOutput({ 110.0f, 286.0f });
+
+ ReduceWithMultipleAxesTest(network,
+ outputShape,
+ inputData,
+ expectedOutput,
+ reduceDescriptor.m_vAxis.size(),
+ backendId);
+}
+
+void ReduceSumWithThreeAxesTest(Compute backendId)
+{
+ armnn::ReduceDescriptor reduceDescriptor;
+ reduceDescriptor.m_vAxis = { 0, 2, 3 };
+ reduceDescriptor.m_KeepDims = false;
+ reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum;
+
+ TensorShape inputShape = { 2, 2, 2, 2 };
+ TensorShape outputShape = { 2 };
+
+ // Construct ArmNN network
+ INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape);
+
+ // Creates structures for input & output.
+ const std::vector<float> inputData({ 1.0f, 2.0f,
+ 3.0f, 4.0f,
+
+ 5.0f, 6.0f,
+ 7.0f, 8.0f,
+
+ 10.0f, 20.0f,
+ 30.0f, 40.0f,
+
+ 50.0f, 60.0f,
+ 70.0f, 80.0f });
+ const std::vector<float> expectedOutput({ 110.0f, 286.0f });
+
+ ReduceWithMultipleAxesTest(network,
+ outputShape,
+ inputData,
+ expectedOutput,
+ reduceDescriptor.m_vAxis.size(),
+ backendId);
+}
+
+using namespace armnn;
+#if defined(ARMCOMPUTENEON_ENABLED)
+BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesKeepDimsCpuAccTest)
+{
+ ReduceSumWithTwoAxesKeepDimsTest(Compute::CpuAcc);
+}
+
+BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesCpuAccTest)
+{
+ ReduceSumWithTwoAxesTest(Compute::CpuAcc);
+}
+
+BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesKeepDimsCpuAccTest)
+{
+ ReduceSumWithThreeAxesKeepDimsTest(Compute::CpuAcc);
+}
+
+BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesCpuAccTest)
+{
+ ReduceSumWithThreeAxesTest(Compute::CpuAcc);
+}
+#endif
+
+#if defined(ARMCOMPUTECL_ENABLED)
+BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesKeepDimsGpuAccTest)
+{
+ ReduceSumWithTwoAxesKeepDimsTest(Compute::GpuAcc);
+}
+
+BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesGpuAccTest)
+{
+ ReduceSumWithTwoAxesTest(Compute::GpuAcc);
+}
+
+BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesKeepDimsGpuAccTest)
+{
+ ReduceSumWithThreeAxesKeepDimsTest(Compute::GpuAcc);
+}
+
+BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesGpuAccTest)
+{
+ ReduceSumWithThreeAxesTest(Compute::GpuAcc);
+}
+#endif
+
+BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file
diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
index a0fca46330..9439ddb61e 100644
--- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
@@ -6,6 +6,9 @@
#pragma once
#include <armnn/backends/OptimizationViews.hpp>
+#include <armnn/utility/Assert.hpp>
+
+#include <aclCommon/ArmComputeUtils.hpp>
namespace armnn
{
@@ -147,4 +150,86 @@ LayerType* FuseLayerWithWeightsAndBiases(OptimizationViews& optimizationViews,
return replacementLayer;
}
+//
+// If reduce layer has multiple axes, add new layer for each axis to simulate the same behaviour
+// as currently only one axis is supported.
+//
+template<typename LayerType>
+void ChainReduceLayers(OptimizationViews& optimizationViews,
+ LayerType* baseLayer,
+ ReduceDescriptor& reduceDescriptor)
+{
+ // If layer has single axis don't chain layers.
+ if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1)
+ {
+ // Save base layer output shape to compare against the output of the final layer added.
+ const TensorInfo baseLayerInfo = baseLayer->GetOutputSlot(0).GetTensorInfo();
+
+ // Vector of new chained layers, used for substitution.
+ std::vector<Layer*> layers;
+
+ // Vector of axes so each layer is reshaped correctly.
+ std::vector<uint32_t> reduceAxis;
+ unsigned int recalulateAxis = 0;
+
+ for (unsigned int i = 0; i != reduceDescriptor.m_vAxis.size(); ++i)
+ {
+ // Get TensorInfo to populate subsequent layers with.
+ TensorInfo layerInfoToModify = baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
+
+ reduceAxis.emplace_back(reduceDescriptor.m_vAxis[i]);
+
+ // Calculate new shape based on the axes.
+ const TensorShape& reducedShape = ComputeReductionTensorShape(layerInfoToModify,
+ reduceAxis,
+ reduceDescriptor.m_KeepDims);
+ layerInfoToModify.SetShape(reducedShape);
+
+ // Create a vector for the single axis to be assigned to the descriptor.
+ // Update axis if keepDims is set reduce layers correctly.
+ std::vector<uint32_t> singleAxis(1, reduceDescriptor.m_vAxis[i] - recalulateAxis);
+
+ // Create a descriptor and assign single axis.
+ ReduceDescriptor newReduceDescriptor = baseLayer->GetParameters();
+ newReduceDescriptor.m_vAxis.assign(singleAxis.begin(), singleAxis.end());
+
+ // Add new layer to graph.
+ std::string layerName = "reduce_layer_" + std::to_string(i);
+ Layer* replacementLayer = optimizationViews.GetGraph().AddLayer<LayerType>(newReduceDescriptor,
+ layerName.c_str());
+
+ // Connect previous layer with new layer.
+ // The first and last layer will be connected when the subgraph is replaced.
+ if (!layers.empty())
+ {
+ layers[i - 1]->GetOutputSlot(0).Connect(replacementLayer->GetInputSlot(0));
+ }
+
+ // Set updated tensorInfo for new layer.
+ replacementLayer->GetOutputSlot(0).SetTensorInfo(layerInfoToModify);
+
+ if (!reduceDescriptor.m_KeepDims)
+ {
+ recalulateAxis++;
+ }
+
+ layers.emplace_back(replacementLayer);
+ }
+
+ // Check if the TensorInfo from the last layer equals the inferred output from the original layer.
+ ARMNN_ASSERT(baseLayerInfo == layers.back()->GetOutputSlot().GetTensorInfo());
+
+ std::list<Layer*> replacementLayers(layers.begin(), layers.end());
+
+ // Substitute new chained subgraph for original reduce layer.
+ SubgraphView substitutionSubgraph(baseLayer);
+ SubgraphView replacementSubgraph(CreateInputsFrom({replacementLayers.front()}),
+ CreateOutputsFrom({replacementLayers.back()}),
+ std::move(replacementLayers));
+
+ optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph});
+
+ }
+}
+
} // namespace armnn
diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp
index d9efab288f..5bc5abcb05 100644
--- a/src/backends/aclCommon/ArmComputeUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeUtils.hpp
@@ -7,6 +7,7 @@
#include <armnn/Descriptors.hpp>
#include <armnn/Tensor.hpp>
#include <armnn/utility/Assert.hpp>
+#include <armnn/utility/NumericCast.hpp>
#include <backendsCommon/WorkloadData.hpp>
#include <arm_compute/core/Types.h>
@@ -267,4 +268,58 @@ inline arm_compute::ReductionOperation ConvertReductionOperationToAcl(const Redu
}
}
+/// Function to compute the output tensor shape based on the axes and if keepDims is set.
+inline const TensorShape ComputeReductionTensorShape(const armnn::TensorInfo& input,
+ const std::vector<uint32_t>& vAxis,
+ const bool keepDims)
+{
+ unsigned int rank = input.GetNumDimensions();
+ unsigned int outputRank = 0;
+
+ // Calculate output dimension
+ if (keepDims)
+ {
+ outputRank = rank;
+ }
+ else if (vAxis.empty())
+ {
+ outputRank = 1;
+ }
+ else if (vAxis.size() > input.GetNumDimensions())
+ {
+ throw LayerValidationException("ReduceLayer: Dimensions to reduce can not be bigger than input dimensions");
+ }
+ else
+ {
+ outputRank = input.GetNumDimensions() - armnn::numeric_cast<unsigned int>(vAxis.size());
+ if (outputRank == 0)
+ {
+ outputRank = 1;
+ }
+ }
+
+ std::vector<unsigned int> dimSizes(outputRank, 1);
+ if (!vAxis.empty())
+ {
+ // Skip the dimension that has been reduced unless keepDims is true.
+ unsigned int outputIndex = 0;
+ for (unsigned int i = 0; i < input.GetNumDimensions(); ++i)
+ {
+ if (std::find(vAxis.begin(), vAxis.end(), i) == vAxis.end())
+ {
+ dimSizes[outputIndex] = armnn::numeric_cast<unsigned int>(input.GetShape()[i]);
+ ++outputIndex;
+ }
+ else if (keepDims)
+ {
+ dimSizes[outputIndex] = 1;
+ ++outputIndex;
+ }
+ }
+ }
+
+ const TensorShape inferredShape = TensorShape(outputRank, dimSizes.data());
+ return inferredShape;
+}
+
} // namespace armnn
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index f97cb4bba8..92a06aa8e1 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -29,6 +29,7 @@
#include "workloads/ClDivisionWorkload.hpp"
#include "workloads/ClFullyConnectedWorkload.hpp"
#include "workloads/ClMultiplicationWorkload.hpp"
+#include "workloads/ClReduceWorkload.hpp"
#include "workloads/ClSubtractionWorkload.hpp"
#include <Optimizer.hpp>
@@ -188,7 +189,8 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
|| base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
|| base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
- || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division)
+ || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division
+ || base.GetType() == LayerType::Reduce)
&& (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
{
for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
@@ -412,6 +414,26 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
}
}
}
+
+ // Separate check for Reduce as we aren't fusing with activation layer
+ if (base.GetType() == LayerType::Reduce)
+ {
+ ReduceLayer* baseLayer = PolymorphicDowncast<ReduceLayer*>(&base);
+
+ // Get params from base layer
+ ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();
+
+ arm_compute::Status status = ClReduceWorkloadValidate(
+ baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
+ baseLayer->GetOutputSlot(0).GetTensorInfo(),
+ reduceDescriptor);
+
+ if (status)
+ {
+ ChainReduceLayers<ReduceLayer>(optimizationViews, baseLayer, reduceDescriptor);
+ untouched.erase(baseLayer->GetGuid());
+ }
+ }
}
}
}
diff --git a/src/backends/cl/workloads/ClReduceWorkload.cpp b/src/backends/cl/workloads/ClReduceWorkload.cpp
index 6f594ff7a9..0ad6259cc2 100644
--- a/src/backends/cl/workloads/ClReduceWorkload.cpp
+++ b/src/backends/cl/workloads/ClReduceWorkload.cpp
@@ -20,23 +20,52 @@ arm_compute::Status ClReduceWorkloadValidate(const TensorInfo& input,
const ReduceDescriptor& desc)
{
const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
- const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
- if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
- {
- return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
- "ClReduceWorkload: Reduction is supported only on 1 axis.");
- }
arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(),
input.GetNumDimensions(),
desc.m_vAxis);
+ // As ACL only support one axis, validate the layer for each axis if more than one is present.
+ if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
+ {
+ arm_compute::Status status;
+
+ for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i)
+ {
+ TensorInfo inputToModify = input;
+ std::vector<uint32_t> singleAxis(1, desc.m_vAxis[i]);
- return arm_compute::CLReductionOperation::validate(&aclInputInfo,
- &aclOutputInfo,
- static_cast<unsigned int>(coords[0]),
- ConvertReductionOperationToAcl(desc),
- desc.m_KeepDims);
+ // Calculate the output shape using the input shape for a single axis.
+ // Currently the output TensorInfo inferred will be reduced upon multiple axis
+ // which will fail validation as only one axis is supported.
+ const TensorShape& reducedShape = ComputeReductionTensorShape(inputToModify, singleAxis, desc.m_KeepDims);
+ inputToModify.SetShape(reducedShape);
+
+ const arm_compute::TensorInfo aclOutputInfoModified =
+ armcomputetensorutils::BuildArmComputeTensorInfo(inputToModify);
+
+ status = arm_compute::CLReductionOperation::validate(&aclInputInfo,
+ &aclOutputInfoModified,
+ static_cast<unsigned int>(coords[i]),
+ ConvertReductionOperationToAcl(desc),
+ desc.m_KeepDims);
+ if (!status)
+ {
+ break;
+ }
+ }
+ return status;
+ }
+ else
+ {
+ const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::CLReductionOperation::validate(&aclInputInfo,
+ &aclOutputInfo,
+ static_cast<unsigned int>(coords[0]),
+ ConvertReductionOperationToAcl(desc),
+ desc.m_KeepDims);
+ }
}
ClReduceWorkload::ClReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index a1299fb458..6d5eab0ddf 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -29,6 +29,7 @@
#include "workloads/NeonDivisionWorkload.hpp"
#include "workloads/NeonFullyConnectedWorkload.hpp"
#include "workloads/NeonMultiplicationWorkload.hpp"
+#include "workloads/NeonReduceWorkload.hpp"
#include "workloads/NeonSubtractionWorkload.hpp"
#include <Optimizer.hpp>
@@ -164,7 +165,8 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
|| base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
|| base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
- || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division)
+ || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division
+ || base.GetType() == LayerType::Reduce)
&& (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
{
for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
@@ -389,6 +391,26 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
}
}
}
+
+ // Separate check for Reduce as we aren't fusing with activation layer
+ if (base.GetType() == LayerType::Reduce)
+ {
+ ReduceLayer* baseLayer = PolymorphicDowncast<ReduceLayer*>(&base);
+
+ // Get params from base layer
+ ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();
+
+ arm_compute::Status status = NeonReduceWorkloadValidate(
+ baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
+ baseLayer->GetOutputSlot(0).GetTensorInfo(),
+ reduceDescriptor);
+
+ if (status)
+ {
+ ChainReduceLayers<ReduceLayer>(optimizationViews, baseLayer, reduceDescriptor);
+ untouched.erase(baseLayer->GetGuid());
+ }
+ }
}
}
}
diff --git a/src/backends/neon/workloads/NeonReduceWorkload.cpp b/src/backends/neon/workloads/NeonReduceWorkload.cpp
index 0e1b46a3a1..6125f3609d 100644
--- a/src/backends/neon/workloads/NeonReduceWorkload.cpp
+++ b/src/backends/neon/workloads/NeonReduceWorkload.cpp
@@ -21,22 +21,52 @@ arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input,
const ReduceDescriptor& desc)
{
const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
- const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
- if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
- {
- return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
- "NeonReduceWorkload: Reduction is supported only on 1 axis.");
- }
arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(),
input.GetNumDimensions(),
desc.m_vAxis);
- return arm_compute::NEReductionOperation::validate(&aclInputInfo,
- &aclOutputInfo,
- static_cast<unsigned int>(coords[0]),
- ConvertReductionOperationToAcl(desc),
- desc.m_KeepDims);
+ // As ACL only support one axis, validate the layer for each axis if more than one is present.
+ if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
+ {
+ arm_compute::Status status;
+
+ for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i)
+ {
+ TensorInfo inputToModify = input;
+ std::vector<uint32_t> singleAxis(1, desc.m_vAxis[i]);
+
+ // Calculate the output shape using the input shape for a single axis.
+ // Currently the output TensorInfo inferred will be reduced upon multiple axis
+ // which will fail validation as only one axis is supported.
+ const TensorShape& reducedShape = ComputeReductionTensorShape(inputToModify, singleAxis, desc.m_KeepDims);
+ inputToModify.SetShape(reducedShape);
+
+ const arm_compute::TensorInfo aclOutputInfoModified =
+ armcomputetensorutils::BuildArmComputeTensorInfo(inputToModify);
+
+ status = arm_compute::NEReductionOperation::validate(&aclInputInfo,
+ &aclOutputInfoModified,
+ static_cast<unsigned int>(coords[i]),
+ ConvertReductionOperationToAcl(desc),
+ desc.m_KeepDims);
+ if (!status)
+ {
+ break;
+ }
+ }
+ return status;
+ }
+ else
+ {
+ const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::NEReductionOperation::validate(&aclInputInfo,
+ &aclOutputInfo,
+ static_cast<unsigned int>(coords[0]),
+ ConvertReductionOperationToAcl(desc),
+ desc.m_KeepDims);
+ }
}
NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
@@ -50,6 +80,7 @@ NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor,
arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(input.info()->num_dimensions(),
info.m_InputTensorInfos[0].GetNumDimensions(),
m_Data.m_Parameters.m_vAxis);
+
m_Layer.configure(&input,
&output,
static_cast<unsigned int>(coords[0]),