From 5fc0fd6661f9647092deb052d052973a237bd52d Mon Sep 17 00:00:00 2001 From: Matthew Sloyan Date: Mon, 3 May 2021 12:22:03 +0100 Subject: MLCE-418 Reduce layer does not support multiple axes * Added backend specific optimization to chain new reduces layers for each axis to simulate behaviour of a layer with multiple axes. * Added function to calculate reduced output shape. * Added unit tests. * Includes rework to fix IVGCVSW-5987. Signed-off-by: Matthew Sloyan Change-Id: I154b3698b5e6756b05b2a0b5a3f0896184efce72 --- Android.mk | 1 + CMakeLists.txt | 1 + .../test/optimizations/ReduceMultipleAxesTests.cpp | 293 +++++++++++++++++++++ src/backends/aclCommon/ArmComputeSubgraphUtils.hpp | 84 ++++++ src/backends/aclCommon/ArmComputeUtils.hpp | 94 +++++++ src/backends/cl/ClBackend.cpp | 21 ++ src/backends/cl/workloads/ClReduceWorkload.cpp | 34 +-- src/backends/neon/NeonBackend.cpp | 21 ++ src/backends/neon/workloads/NeonReduceWorkload.cpp | 34 ++- 9 files changed, 554 insertions(+), 29 deletions(-) create mode 100644 src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp diff --git a/Android.mk b/Android.mk index d9230e5585..168b32a400 100644 --- a/Android.mk +++ b/Android.mk @@ -393,6 +393,7 @@ LOCAL_SRC_FILES := \ src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp \ src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp \ src/armnn/test/optimizations/PermuteAsReshapeTests.cpp \ + src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp \ src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp \ src/armnn/test/optimizations/TransposeAsReshapeTests.cpp \ src/armnn/test/OptimizerTests.cpp \ diff --git a/CMakeLists.txt b/CMakeLists.txt index dfdff89bbe..1e201ea0ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -566,6 +566,7 @@ if(BUILD_UNIT_TESTS) src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp src/armnn/test/optimizations/PermuteAsReshapeTests.cpp 
+ src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp src/armnn/test/optimizations/TransposeAsReshapeTests.cpp src/armnn/test/OptionalTest.cpp diff --git a/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp b/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp new file mode 100644 index 0000000000..b42c0a2cfb --- /dev/null +++ b/src/armnn/test/optimizations/ReduceMultipleAxesTests.cpp @@ -0,0 +1,293 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "../GraphUtils.hpp" +#include "../TestUtils.hpp" + +#include + +#include + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Optimizer) + +INetworkPtr CreateSimpleReduceNetwork(ReduceDescriptor reduceDescriptor, + TensorShape& inputShape, + TensorShape& outputShape) +{ + // Create a network + INetworkPtr network = INetwork::Create(); + + const std::string layerName("reduce_layer"); + const TensorInfo inputInfo (inputShape, DataType::Float32); + const TensorInfo outputInfo(outputShape, DataType::Float32); + + IConnectableLayer* const inputLayer = network->AddInputLayer(0); + IConnectableLayer* const reduceLayer = network->AddReduceLayer(reduceDescriptor, layerName.c_str()); + IConnectableLayer* const outputLayer1 = network->AddOutputLayer(0); + IConnectableLayer* const outputLayer2 = network->AddOutputLayer(1); + + inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); + reduceLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); + + inputLayer->GetOutputSlot(0).Connect(reduceLayer->GetInputSlot(0)); + reduceLayer->GetOutputSlot(0).Connect(outputLayer1->GetInputSlot(0)); + reduceLayer->GetOutputSlot(0).Connect(outputLayer2->GetInputSlot(0)); + + return network; +} + +void ReduceWithMultipleAxesTest(INetworkPtr& network, + const TensorShape& outputShape, + const std::vector& inputData, + const std::vector& expectedOutput, + const size_t numOfAxes, + Compute backendId) +{ + // 
Create ArmNN runtime + IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions()); + + // Optimise ArmNN network + IOptimizedNetworkPtr optNet = Optimize(*network, {backendId}, run->GetDeviceSpec()); + + Graph& graph = GetGraphForTesting(optNet.get()); + if (numOfAxes == 2) + { + BOOST_CHECK(graph.GetNumLayers() == 5); + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); + } + else + { + BOOST_CHECK(graph.GetNumLayers() == 6); + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); + } + + // Get last layer in new chain, layers name follow 0, 1, 2 pattern + std::string layerName = "reduce_layer_" + std::to_string(numOfAxes - 1); + Layer* const reduceLayer = GetFirstLayerWithName(graph, layerName); + BOOST_TEST(reduceLayer); + auto reduceTensorInfo = reduceLayer->GetOutputSlot().GetTensorInfo(); + + // Tensorshape and the data type are correct + BOOST_TEST((reduceTensorInfo.GetShape() == outputShape)); + BOOST_TEST((reduceTensorInfo.GetDataType() == DataType::Float32)); + + // Load network into runtime + NetworkId networkIdentifier; + run->LoadNetwork(networkIdentifier, std::move(optNet)); + + // Create input and output tensors + std::vector outputData(expectedOutput.size()); + InputTensors inputTensors + { + { 0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputData.data()) } + }; + OutputTensors outputTensors + { + { 0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData.data()) }, + { 1, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 1), outputData.data()) } + }; + + // Run inference + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + + // Checks the results + BOOST_TEST(outputData == expectedOutput); +} + +void ReduceSumWithTwoAxesKeepDimsTest(Compute backendId) +{ + 
armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 1, 2 }; + reduceDescriptor.m_KeepDims = true; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 1, 3, 2, 4 }; + TensorShape outputShape = { 1, 1, 1, 4 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. + const std::vector inputData({ 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + + 10.0f, 20.0f, 30.0f, 40.0f, + 50.0f, 60.0f, 70.0f, 80.0f, + + 100.0f, 200.0f, 300.0f, 400.0f, + 500.0f, 600.0f, 700.0f, 800.0f }); + const std::vector expectedOutput({ 666.0f, 888.0f, 1110.0f, 1332.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +void ReduceSumWithTwoAxesTest(Compute backendId) +{ + armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 1, 2 }; + reduceDescriptor.m_KeepDims = false; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 1, 3, 2, 4 }; + TensorShape outputShape = { 1, 4 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. 
+ const std::vector inputData({ 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + + 10.0f, 20.0f, 30.0f, 40.0f, + 50.0f, 60.0f, 70.0f, 80.0f, + + 100.0f, 200.0f, 300.0f, 400.0f, + 500.0f, 600.0f, 700.0f, 800.0f }); + const std::vector expectedOutput({ 666.0f, 888.0f, 1110.0f, 1332.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +void ReduceSumWithThreeAxesKeepDimsTest(Compute backendId) +{ + armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 0, 2, 3 }; + reduceDescriptor.m_KeepDims = true; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 2, 2, 2, 2 }; + TensorShape outputShape = { 1, 2, 1, 1 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. + const std::vector inputData({ 1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 10.0f, 20.0f, + 30.0f, 40.0f, + + 50.0f, 60.0f, + 70.0f, 80.0f }); + const std::vector expectedOutput({ 110.0f, 286.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +void ReduceSumWithThreeAxesTest(Compute backendId) +{ + armnn::ReduceDescriptor reduceDescriptor; + reduceDescriptor.m_vAxis = { 0, 2, 3 }; + reduceDescriptor.m_KeepDims = false; + reduceDescriptor.m_ReduceOperation = armnn::ReduceOperation::Sum; + + TensorShape inputShape = { 2, 2, 2, 2 }; + TensorShape outputShape = { 2 }; + + // Construct ArmNN network + INetworkPtr network = CreateSimpleReduceNetwork(reduceDescriptor, inputShape, outputShape); + + // Creates structures for input & output. 
+ const std::vector inputData({ 1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 10.0f, 20.0f, + 30.0f, 40.0f, + + 50.0f, 60.0f, + 70.0f, 80.0f }); + const std::vector expectedOutput({ 110.0f, 286.0f }); + + ReduceWithMultipleAxesTest(network, + outputShape, + inputData, + expectedOutput, + reduceDescriptor.m_vAxis.size(), + backendId); +} + +using namespace armnn; +#if defined(ARMCOMPUTENEON_ENABLED) +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesKeepDimsCpuAccTest) +{ + ReduceSumWithTwoAxesKeepDimsTest(Compute::CpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesCpuAccTest) +{ + ReduceSumWithTwoAxesTest(Compute::CpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesKeepDimsCpuAccTest) +{ + ReduceSumWithThreeAxesKeepDimsTest(Compute::CpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesCpuAccTest) +{ + ReduceSumWithThreeAxesTest(Compute::CpuAcc); +} +#endif + +#if defined(ARMCOMPUTECL_ENABLED) +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesKeepDimsGpuAccTest) +{ + ReduceSumWithTwoAxesKeepDimsTest(Compute::GpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithTwoAxesGpuAccTest) +{ + ReduceSumWithTwoAxesTest(Compute::GpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesKeepDimsGpuAccTest) +{ + ReduceSumWithThreeAxesKeepDimsTest(Compute::GpuAcc); +} + +BOOST_AUTO_TEST_CASE(ReduceSumWithThreeAxesGpuAccTest) +{ + ReduceSumWithThreeAxesTest(Compute::GpuAcc); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp index a0fca46330..521c17cd62 100644 --- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp +++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp @@ -6,6 +6,9 @@ #pragma once #include +#include + +#include namespace armnn { @@ -147,4 +150,85 @@ LayerType* FuseLayerWithWeightsAndBiases(OptimizationViews& optimizationViews, return replacementLayer; } +// +// If reduce layer has multiple axes, add new layer for each 
axis to simulate the same behaviour +// as currently only one axis is supported. +// +template +std::vector ChainReduceLayers(OptimizationViews& optimizationViews, + LayerType* baseLayer, + ReduceDescriptor& desc) +{ + // Vector of new chained layers, used for substitution. + std::vector layers; + + // Vector of axes so each layer is reshaped correctly. + std::vector axes; + unsigned int recalulatedAxis = 0; + + for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i) + { + // Get TensorInfo from base layer and reduce shape using axis. + TensorInfo layerInfo = baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + + axes.emplace_back(desc.m_vAxis[i]); + + const TensorInfo& reducedTensorInfo = ComputeReductionTensorShape(layerInfo, + axes, + desc.m_KeepDims); + + // Create a vector for the single axis to be assigned to the descriptor. + // Update axis if keepDims is set reduce layers correctly. + std::vector singleAxis(1, desc.m_vAxis[i] - recalulatedAxis); + + // Create a descriptor and assign single axis. + ReduceDescriptor newReduceDescriptor = baseLayer->GetParameters(); + newReduceDescriptor.m_vAxis.assign(singleAxis.begin(), singleAxis.end()); + + // Add new layer to graph. + std::string layerName = "reduce_layer_" + std::to_string(i); + Layer* replacementLayer = optimizationViews.GetGraph().AddLayer(newReduceDescriptor, + layerName.c_str()); + // Connect previous layer with new layer. + // The first and last layer will be connected when the subgraph is replaced. + if (!layers.empty()) + { + layers[i - 1]->GetOutputSlot(0).Connect(replacementLayer->GetInputSlot(0)); + } + + // Set updated tensorInfo for new layer. + replacementLayer->GetOutputSlot(0).SetTensorInfo(reducedTensorInfo); + + if (!desc.m_KeepDims) + { + recalulatedAxis++; + } + + layers.emplace_back(replacementLayer); + } + + // Check if the TensorInfo from the last layer equals the inferred output from the original layer. 
+ ARMNN_ASSERT(baseLayer->GetOutputSlot(0).GetTensorInfo() == layers.back()->GetOutputSlot().GetTensorInfo()); + + return layers; +} + +// +// Substitute baseLayer with new subgraph +// +template +void ReplaceLayers(OptimizationViews& optimizationViews, + LayerType* baseLayer, + std::vector& layers) +{ + std::list replacementLayers(layers.begin(), layers.end()); + + SubgraphView substitutionSubgraph(baseLayer); + SubgraphView replacementSubgraph(CreateInputsFrom({replacementLayers.front()}), + CreateOutputsFrom({replacementLayers.back()}), + std::move(replacementLayers)); + + optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph}); +} + } // namespace armnn diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp index d9efab288f..624ce5df7a 100644 --- a/src/backends/aclCommon/ArmComputeUtils.hpp +++ b/src/backends/aclCommon/ArmComputeUtils.hpp @@ -7,10 +7,19 @@ #include #include #include +#include #include #include +#if defined(ARMCOMPUTENEON_ENABLED) +#include "neon/workloads/NeonReduceWorkload.hpp" +#endif + +#if defined(ARMCOMPUTECL_ENABLED) +#include "cl/workloads/ClReduceWorkload.hpp" +#endif + namespace armnn { @@ -267,4 +276,89 @@ inline arm_compute::ReductionOperation ConvertReductionOperationToAcl(const Redu } } +/// Function to compute the output tensor shape based on the axes and if keepDims is set. 
+inline const TensorInfo ComputeReductionTensorShape(const armnn::TensorInfo& input, + const std::vector& vAxis, + const bool keepDims) +{ + auto reducedTensorInfo = input; + unsigned int rank = reducedTensorInfo.GetNumDimensions(); + unsigned int outputRank = 0; + // Calculate output dimension + if (keepDims) + { + outputRank = rank; + } + else if (vAxis.empty()) + { + outputRank = 1; + } + else if (vAxis.size() > reducedTensorInfo.GetNumDimensions()) + { + throw LayerValidationException("ReduceLayer: Dimensions to reduce can not be bigger than input dimensions"); + } + else + { + outputRank = reducedTensorInfo.GetNumDimensions() - armnn::numeric_cast(vAxis.size()); + if (outputRank == 0) + { + outputRank = 1; + } + } + std::vector dimSizes(outputRank, 1); + if (!vAxis.empty()) + { + // Skip the dimension that has been reduced unless keepDims is true. + unsigned int outputIndex = 0; + for (unsigned int i = 0; i < reducedTensorInfo.GetNumDimensions(); ++i) + { + if (std::find(vAxis.begin(), vAxis.end(), i) == vAxis.end()) + { + dimSizes[outputIndex] = armnn::numeric_cast(reducedTensorInfo.GetShape()[i]); + ++outputIndex; + } + else if (keepDims) + { + dimSizes[outputIndex] = 1; + ++outputIndex; + } + } + } + const TensorShape inferredShape = TensorShape(outputRank, dimSizes.data()); + reducedTensorInfo.SetShape(inferredShape); + return reducedTensorInfo; +} + +/// Macro function check if layer with multiple axes is supported on each backend +#define IS_MULTI_AXES_REDUCE_SUPPORTED(func, input, desc, status) \ + armnn::TensorInfo inputTensorInfo = input; \ + unsigned int recalulatedAxis = 0; \ + std::vector axes; \ + \ + for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i) \ + { \ + axes.emplace_back(desc.m_vAxis[i]); \ + \ + const armnn::TensorInfo& reducedTensorInfo = \ + ComputeReductionTensorShape(input, axes, desc.m_KeepDims); \ + \ + std::vector singleAxis(1, desc.m_vAxis[i] - recalulatedAxis); \ + \ + armnn::ReduceDescriptor newReduceDescriptor = desc; \ + 
newReduceDescriptor.m_vAxis.assign(singleAxis.begin(), singleAxis.end()); \ + \ + status = func(inputTensorInfo, reducedTensorInfo, newReduceDescriptor); \ + if (!status) \ + { \ + break; \ + } \ + \ + if (!desc.m_KeepDims) \ + { \ + recalulatedAxis++; \ + } \ + \ + inputTensorInfo = reducedTensorInfo; \ + } + } // namespace armnn diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index 35770d9219..a9ab237325 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -30,6 +30,7 @@ #include "workloads/ClDivisionWorkload.hpp" #include "workloads/ClFullyConnectedWorkload.hpp" #include "workloads/ClMultiplicationWorkload.hpp" +#include "workloads/ClReduceWorkload.hpp" #include "workloads/ClSubtractionWorkload.hpp" #include @@ -220,6 +221,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph, --it; Layer& base = **it; + // Fuse activation into previous layer if supported by backend if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication @@ -451,6 +453,25 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph, } } } + + // Separate reduce layer with multiple axes into multiple reduce layers with 1 axis. + if (base.GetType() == LayerType::Reduce) + { + ReduceLayer* baseLayer = PolymorphicDowncast(&base); + ReduceDescriptor reduceDescriptor = baseLayer->GetParameters(); + + if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1) + { + // Add new layers to the graph and connect them. + std::vector layers = ChainReduceLayers(optimizationViews, + baseLayer, + reduceDescriptor); + + // Replace existing baselayer with new subgraph. 
+ ReplaceLayers(optimizationViews, baseLayer, layers); + untouched.erase(baseLayer->GetGuid()); + } + } } if (optimizationViews.GetSubstitutions().empty()) diff --git a/src/backends/cl/workloads/ClReduceWorkload.cpp b/src/backends/cl/workloads/ClReduceWorkload.cpp index 6f594ff7a9..18415c4cba 100644 --- a/src/backends/cl/workloads/ClReduceWorkload.cpp +++ b/src/backends/cl/workloads/ClReduceWorkload.cpp @@ -19,24 +19,28 @@ arm_compute::Status ClReduceWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const ReduceDescriptor& desc) { - const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); - const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); - if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1) + if ( desc.m_vAxis.size()==1 || desc.m_vAxis.empty()) { - return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, - "ClReduceWorkload: Reduction is supported only on 1 axis."); - } - - arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), - input.GetNumDimensions(), - desc.m_vAxis); + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), + input.GetNumDimensions(), + desc.m_vAxis); - return arm_compute::CLReductionOperation::validate(&aclInputInfo, - &aclOutputInfo, - static_cast(coords[0]), - ConvertReductionOperationToAcl(desc), - desc.m_KeepDims); + return arm_compute::CLReductionOperation::validate(&aclInputInfo, + &aclOutputInfo, + static_cast(coords[0]), + ConvertReductionOperationToAcl(desc), + desc.m_KeepDims); + } + else + { + // Validate layer if there are multiple axes. 
+ arm_compute::Status status; + IS_MULTI_AXES_REDUCE_SUPPORTED(ClReduceWorkloadValidate, input, desc, status); + return status; + } } ClReduceWorkload::ClReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info) diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp index a1299fb458..b496238cf3 100644 --- a/src/backends/neon/NeonBackend.cpp +++ b/src/backends/neon/NeonBackend.cpp @@ -29,6 +29,7 @@ #include "workloads/NeonDivisionWorkload.hpp" #include "workloads/NeonFullyConnectedWorkload.hpp" #include "workloads/NeonMultiplicationWorkload.hpp" +#include "workloads/NeonReduceWorkload.hpp" #include "workloads/NeonSubtractionWorkload.hpp" #include @@ -161,6 +162,7 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph --it; Layer& base = **it; + // Fuse activation into previous layer if supported by backend if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication @@ -393,6 +395,25 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph } } } + + // Separate reduce layer with multiple axes into multiple reduce layers with 1 axis. + if (base.GetType() == LayerType::Reduce) + { + ReduceLayer* baseLayer = PolymorphicDowncast(&base); + ReduceDescriptor reduceDescriptor = baseLayer->GetParameters(); + + if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1) + { + // Add new layers to the graph and connect them. + std::vector layers = ChainReduceLayers(optimizationViews, + baseLayer, + reduceDescriptor); + + // Replace existing baselayer with new subgraph. 
+ ReplaceLayers(optimizationViews, baseLayer, layers); + untouched.erase(baseLayer->GetGuid()); + } + } } if (optimizationViews.GetSubstitutions().empty()) diff --git a/src/backends/neon/workloads/NeonReduceWorkload.cpp b/src/backends/neon/workloads/NeonReduceWorkload.cpp index 0e1b46a3a1..1436cd1192 100644 --- a/src/backends/neon/workloads/NeonReduceWorkload.cpp +++ b/src/backends/neon/workloads/NeonReduceWorkload.cpp @@ -20,23 +20,28 @@ arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const ReduceDescriptor& desc) { - const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); - const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); - if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1) + if ( desc.m_vAxis.size()==1 || desc.m_vAxis.empty()) { - return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, - "NeonReduceWorkload: Reduction is supported only on 1 axis."); - } + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); - arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), - input.GetNumDimensions(), - desc.m_vAxis); + arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(), + input.GetNumDimensions(), + desc.m_vAxis); - return arm_compute::NEReductionOperation::validate(&aclInputInfo, - &aclOutputInfo, - static_cast(coords[0]), - ConvertReductionOperationToAcl(desc), - desc.m_KeepDims); + return arm_compute::NEReductionOperation::validate(&aclInputInfo, + &aclOutputInfo, + static_cast(coords[0]), + ConvertReductionOperationToAcl(desc), + desc.m_KeepDims); + } + else + { + // Validate layer if there are multiple axes. 
+ arm_compute::Status status; + IS_MULTI_AXES_REDUCE_SUPPORTED(NeonReduceWorkloadValidate, input, desc, status); + return status; + } } NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info) @@ -50,6 +55,7 @@ NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(input.info()->num_dimensions(), info.m_InputTensorInfos[0].GetNumDimensions(), m_Data.m_Parameters.m_vAxis); + m_Layer.configure(&input, &output, static_cast(coords[0]), -- cgit v1.2.1