path: root/src/backends/neon
author    David Beck <david.beck@arm.com>    2018-09-24 15:59:27 +0100
committer Matthew Bentham <matthew.bentham@arm.com>    2018-10-10 16:16:57 +0100
commit    0dbe0ee25312b728d77383d11c465156e64ae757 (patch)
tree      af37a9802e3ad551e1bf63f7636508cde7a41643 /src/backends/neon
parent    b4540bef0b0327683fe8e63f727c1212800dc2a9 (diff)
download  armnn-0dbe0ee25312b728d77383d11c465156e64ae757.tar.gz
IVGCVSW-1899 : Neon backend folder structure
armnn:149855 Change-Id: I26e8cf83422a65049386a5ebdb6d0001627aefaa
Diffstat (limited to 'src/backends/neon')
-rw-r--r--  src/backends/neon/CMakeLists.txt | 28
-rw-r--r--  src/backends/neon/NeonLayerSupport.cpp | 468
-rw-r--r--  src/backends/neon/NeonLayerSupport.hpp | 163
-rw-r--r--  src/backends/neon/NeonTensorHandle.hpp | 142
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.cpp | 494
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.hpp | 138
-rw-r--r--  src/backends/neon/backend.cmake | 13
-rw-r--r--  src/backends/neon/backend.mk | 47
-rw-r--r--  src/backends/neon/test/CMakeLists.txt | 4
-rw-r--r--  src/backends/neon/workloads/CMakeLists.txt | 86
-rw-r--r--  src/backends/neon/workloads/NeonActivationFloatWorkload.cpp | 57
-rw-r--r--  src/backends/neon/workloads/NeonActivationFloatWorkload.hpp | 29
-rw-r--r--  src/backends/neon/workloads/NeonActivationUint8Workload.cpp | 35
-rw-r--r--  src/backends/neon/workloads/NeonActivationUint8Workload.hpp | 28
-rw-r--r--  src/backends/neon/workloads/NeonAdditionFloatWorkload.cpp | 48
-rw-r--r--  src/backends/neon/workloads/NeonAdditionFloatWorkload.hpp | 30
-rw-r--r--  src/backends/neon/workloads/NeonBaseConstantWorkload.hpp | 82
-rw-r--r--  src/backends/neon/workloads/NeonBaseMergerWorkload.hpp | 26
-rw-r--r--  src/backends/neon/workloads/NeonBaseSplitterWorkload.hpp | 27
-rw-r--r--  src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.cpp | 96
-rw-r--r--  src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.hpp | 42
-rw-r--r--  src/backends/neon/workloads/NeonConstantFloatWorkload.cpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonConstantFloatWorkload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonConstantUint8Workload.cpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonConstantUint8Workload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp | 41
-rw-r--r--  src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp | 26
-rw-r--r--  src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp | 43
-rw-r--r--  src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp | 26
-rw-r--r--  src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp | 146
-rw-r--r--  src/backends/neon/workloads/NeonConvolution2dBaseWorkload.hpp | 49
-rw-r--r--  src/backends/neon/workloads/NeonConvolution2dFloatWorkload.cpp | 40
-rw-r--r--  src/backends/neon/workloads/NeonConvolution2dFloatWorkload.hpp | 29
-rw-r--r--  src/backends/neon/workloads/NeonConvolution2dUint8Workload.cpp | 35
-rw-r--r--  src/backends/neon/workloads/NeonConvolution2dUint8Workload.hpp | 29
-rw-r--r--  src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.cpp | 49
-rw-r--r--  src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.hpp | 21
-rw-r--r--  src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp | 93
-rw-r--r--  src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.hpp | 33
-rw-r--r--  src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp | 93
-rw-r--r--  src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.hpp | 29
-rw-r--r--  src/backends/neon/workloads/NeonFloorFloatWorkload.cpp | 30
-rw-r--r--  src/backends/neon/workloads/NeonFloorFloatWorkload.hpp | 27
-rw-r--r--  src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp | 110
-rw-r--r--  src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp | 40
-rw-r--r--  src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp | 42
-rw-r--r--  src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp | 32
-rw-r--r--  src/backends/neon/workloads/NeonLstmFloatWorkload.cpp | 22
-rw-r--r--  src/backends/neon/workloads/NeonLstmFloatWorkload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonMergerFloatWorkload.cpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonMergerFloatWorkload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonMergerUint8Workload.cpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonMergerUint8Workload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp | 60
-rw-r--r--  src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp | 30
-rw-r--r--  src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp | 70
-rw-r--r--  src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp | 33
-rw-r--r--  src/backends/neon/workloads/NeonPermuteWorkload.cpp | 54
-rw-r--r--  src/backends/neon/workloads/NeonPermuteWorkload.hpp | 43
-rw-r--r--  src/backends/neon/workloads/NeonPooling2dBaseWorkload.cpp | 47
-rw-r--r--  src/backends/neon/workloads/NeonPooling2dBaseWorkload.hpp | 37
-rw-r--r--  src/backends/neon/workloads/NeonPooling2dFloatWorkload.cpp | 27
-rw-r--r--  src/backends/neon/workloads/NeonPooling2dFloatWorkload.hpp | 25
-rw-r--r--  src/backends/neon/workloads/NeonPooling2dUint8Workload.cpp | 26
-rw-r--r--  src/backends/neon/workloads/NeonPooling2dUint8Workload.hpp | 25
-rw-r--r--  src/backends/neon/workloads/NeonReshapeFloatWorkload.cpp | 32
-rw-r--r--  src/backends/neon/workloads/NeonReshapeFloatWorkload.hpp | 29
-rw-r--r--  src/backends/neon/workloads/NeonReshapeUint8Workload.cpp | 30
-rw-r--r--  src/backends/neon/workloads/NeonReshapeUint8Workload.hpp | 27
-rw-r--r--  src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp | 30
-rw-r--r--  src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp | 32
-rw-r--r--  src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp | 28
-rw-r--r--  src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp | 41
-rw-r--r--  src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp | 26
-rw-r--r--  src/backends/neon/workloads/NeonSplitterFloatWorkload.cpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonSplitterFloatWorkload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonSplitterUint8Workload.cpp | 17
-rw-r--r--  src/backends/neon/workloads/NeonSplitterUint8Workload.hpp | 20
-rw-r--r--  src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp | 46
-rw-r--r--  src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp | 27
-rw-r--r--  src/backends/neon/workloads/NeonWorkloadUtils.cpp | 60
-rw-r--r--  src/backends/neon/workloads/NeonWorkloadUtils.hpp | 34
-rw-r--r--  src/backends/neon/workloads/NeonWorkloads.hpp | 41
84 files changed, 4387 insertions, 0 deletions
diff --git a/src/backends/neon/CMakeLists.txt b/src/backends/neon/CMakeLists.txt
new file mode 100644
index 0000000000..c748825464
--- /dev/null
+++ b/src/backends/neon/CMakeLists.txt
@@ -0,0 +1,28 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+if(ARMCOMPUTENEON)
+ list(APPEND armnnNeonBackend_sources
+ NeonLayerSupport.cpp
+ NeonLayerSupport.hpp
+ NeonWorkloadFactory.cpp
+ NeonWorkloadFactory.hpp
+ NeonTensorHandle.hpp
+ )
+
+ add_subdirectory(workloads test)
+else()
+ list(APPEND armnnNeonBackend_sources
+ NeonLayerSupport.cpp
+ NeonLayerSupport.hpp
+ NeonWorkloadFactory.cpp
+ NeonWorkloadFactory.hpp
+ )
+endif()
+
+add_library(armnnNeonBackend STATIC ${armnnNeonBackend_sources})
+target_include_directories(armnnNeonBackend PRIVATE ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(armnnNeonBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnNeonBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
new file mode 100644
index 0000000000..dfaea5c81c
--- /dev/null
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -0,0 +1,468 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonLayerSupport.hpp"
+
+#include <LayerSupportCommon.hpp>
+#include <InternalTypes.hpp>
+
+#include <armnn/Descriptors.hpp>
+#include <armnn/Types.hpp>
+#include <armnn/Tensor.hpp>
+
+#include <boost/core/ignore_unused.hpp>
+
+#ifdef ARMCOMPUTENEON_ENABLED
+#include "workloads/NeonAdditionFloatWorkload.hpp"
+#include "workloads/NeonActivationFloatWorkload.hpp"
+#include "workloads/NeonBatchNormalizationFloatWorkload.hpp"
+#include "workloads/NeonConvolution2dBaseWorkload.hpp"
+#include "workloads/NeonDepthwiseConvolutionBaseWorkload.hpp"
+#include "workloads/NeonL2NormalizationFloatWorkload.hpp"
+#include "workloads/NeonMultiplicationFloatWorkload.hpp"
+#include "workloads/NeonNormalizationFloatWorkload.hpp"
+#include "workloads/NeonFullyConnectedWorkload.hpp"
+#include "workloads/NeonPermuteWorkload.hpp"
+#include "workloads/NeonPooling2dBaseWorkload.hpp"
+#include "workloads/NeonSoftmaxBaseWorkload.hpp"
+#include "workloads/NeonSubtractionFloatWorkload.hpp"
+#endif
+
+using namespace boost;
+
+namespace armnn
+{
+
+bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc)
+{
+ // See the arm_compute::NEDirectConvolutionLayer documentation for the supported cases,
+ // and complement it with the NEDirectConvolutionLayerKernel::configure() implementation.
+
+ // Only 1x1 weights use direct convolution. Performance results and details are in:
+ // https://jira.arm.com/browse/IVGCVSW-1003
+ // Measurements were taken as of clframework: f105ab972135bcd21304883eff040d7e587099bc
+
+ const bool dataTypeSupported = (weightInfo.GetDataType() == armnn::DataType::Float32);
+
+ // Strides: 1|2|3
+ const bool strideSupported = (desc.m_StrideX == 1 || desc.m_StrideX == 2 || desc.m_StrideX == 3) &&
+ (desc.m_StrideY == 1 || desc.m_StrideY == 2 || desc.m_StrideY == 3);
+
+ auto paddingLargerThan = [](const Convolution2dDescriptor& conv2ddesc, unsigned int value)
+ {
+ return conv2ddesc.m_PadLeft > value || conv2ddesc.m_PadRight > value ||
+ conv2ddesc.m_PadTop > value || conv2ddesc.m_PadBottom > value;
+ };
+
+ // Supported sizes and padding.
+ const bool sizeAndPaddingSupported =
+ // Pad > 0 not supported for 1x1 weights.
+ (weightInfo.GetShape()[2] == 1 && weightInfo.GetShape()[3] == 1 && !paddingLargerThan(desc, 0u));
+
+ const bool preferDirectConvolution = dataTypeSupported &&
+ strideSupported &&
+ sizeAndPaddingSupported &&
+ // NEDirectConvolutionLayerKernel doesn't support NULL bias.
+ desc.m_BiasEnabled;
+ return preferDirectConvolution;
+}
+
+bool IsNeonNormalizationDescParamsSupported(std::string* reasonIfUnsupported, const NormalizationDescriptor& parameters)
+{
+ if (parameters.m_NormMethodType != NormalizationAlgorithmMethod::LocalBrightness)
+ {
+ if (reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = "Unsupported normalisation method type, only LocalBrightness is supported";
+ }
+ return false;
+ }
+ if (parameters.m_NormSize % 2 == 0)
+ {
+ if (reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = "Normalization size must be an odd number.";
+ }
+ return false;
+ }
+
+ return true;
+}
+
+bool IsNeonBackendSupported(std::string* reasonIfUnsupported)
+{
+#if ARMCOMPUTENEON_ENABLED
+ return true;
+#else
+ if (reasonIfUnsupported != nullptr)
+ {
+ *reasonIfUnsupported = "The armnn library has been built without NEON support";
+ }
+ return false;
+#endif
+}
+
+template<typename FloatFunc, typename Uint8Func, typename ... Params>
+bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported,
+ DataType dataType,
+ FloatFunc floatFuncPtr,
+ Uint8Func uint8FuncPtr,
+ Params&&... params)
+{
+ return IsNeonBackendSupported(reasonIfUnsupported) &&
+ IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+ dataType,
+ floatFuncPtr,
+ floatFuncPtr,
+ uint8FuncPtr,
+ std::forward<Params>(params)...);
+}
+
+#if ARMCOMPUTENEON_ENABLED
+template<class FuncType, class... Args>
+inline bool IsWorkloadSupported(FuncType& func, std::string* reasonIfUnsupported, Args&&... args)
+{
+ arm_compute::Status aclStatus = func(std::forward<Args>(args)...);
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+ return supported;
+}
+
+#define FORWARD_WORKLOAD_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \
+ return IsWorkloadSupported(func, reasonIfUnsupported, __VA_ARGS__);
+#else
+#define FORWARD_WORKLOAD_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \
+ return IsNeonBackendSupported(reasonIfUnsupported);
+#endif
+
+bool IsActivationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(descriptor);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonActivationWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor);
+}
+
+bool IsAdditionSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonAdditionWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
+}
+
+bool IsBatchNormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchNormalizationValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ mean,
+ var,
+ beta,
+ gamma,
+ descriptor);
+}
+
+bool IsConstantSupportedNeon(const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeNeon(reasonIfUnsupported,
+ output.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsConvolution2dSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonConvolution2dWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor,
+ weights,
+ biases);
+}
+
+bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonDepthwiseConvolutionWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor,
+ weights,
+ biases);
+}
+
+bool IsDivisionSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ // At the moment division is not supported
+ return false;
+}
+
+bool IsSubtractionSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonSubtractionWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
+}
+
+bool IsFullyConnectedSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ // At the moment U8 is unsupported
+ if (input.GetDataType() == DataType::QuantisedAsymm8)
+ {
+ return false;
+ }
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFullyConnectedWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ weights,
+ biases,
+ descriptor);
+}
+
+bool IsInputSupportedNeon(const TensorInfo& input,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeNeon(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsL2NormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output);
+}
+
+bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs,
+ const OriginsDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(descriptor);
+ return IsSupportedForDataTypeNeon(reasonIfUnsupported,
+ inputs[0]->GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsMultiplicationSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonMultiplicationWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
+}
+
+bool IsNormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonNormalizationWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsOutputSupportedNeon(const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeNeon(reasonIfUnsupported,
+ output.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsPermuteSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const PermuteDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonPermuteWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsPooling2dSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonPooling2dWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsResizeBilinearSupportedNeon(const TensorInfo& input,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ return false;
+}
+
+bool IsSoftmaxSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonSoftmaxWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsSplitterSupportedNeon(const TensorInfo& input,
+ const ViewsDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(descriptor);
+ return IsSupportedForDataTypeNeon(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsFakeQuantizationSupportedNeon(const TensorInfo& input,
+ const FakeQuantizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(descriptor);
+ return false;
+}
+
+bool IsReshapeSupportedNeon(const TensorInfo& input,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeNeon(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsFloorSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(output);
+ return IsNeonBackendSupported(reasonIfUnsupported) &&
+ IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+ input.GetDataType(),
+ &FalseFuncF16<>,
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(outputStateIn);
+ ignore_unused(cellStateIn);
+ ignore_unused(scratchBuffer);
+ ignore_unused(outputStateOut);
+ ignore_unused(cellStateOut);
+ ignore_unused(output);
+ ignore_unused(descriptor);
+ ignore_unused(inputToForgetWeights);
+ ignore_unused(inputToCellWeights);
+ ignore_unused(inputToOutputWeights);
+ ignore_unused(recurrentToForgetWeights);
+ ignore_unused(recurrentToCellWeights);
+ ignore_unused(recurrentToOutputWeights);
+ ignore_unused(forgetGateBias);
+ ignore_unused(cellBias);
+ ignore_unused(outputGateBias);
+ ignore_unused(inputToInputWeights);
+ ignore_unused(recurrentToInputWeights);
+ ignore_unused(cellToInputWeights);
+ ignore_unused(inputGateBias);
+ ignore_unused(projectionWeights);
+ ignore_unused(projectionBias);
+ ignore_unused(cellToForgetWeights);
+ ignore_unused(cellToOutputWeights);
+ return false;
+}
+
+bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(output);
+ return true;
+}
+
+bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(output);
+ return true;
+}
+
+bool IsMeanSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const MeanDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ return false;
+}
+
+}
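
For context on the pattern above: every Is*SupportedNeon query runs the same two steps. It first confirms the backend was compiled in (IsNeonBackendSupported), then FORWARD_WORKLOAD_VALIDATE_FUNC forwards the arguments to an Arm Compute validate function, and IsWorkloadSupported converts the returned arm_compute::Status into a bool plus an optional reason string. A minimal self-contained sketch of that conversion follows; the Status struct and the validateNormSize lambda are illustrative stand-ins, not types from the commit:

    #include <iostream>
    #include <string>
    #include <utility>

    // Stand-in for arm_compute::Status: an ok flag plus a description.
    struct Status
    {
        bool ok;
        std::string description;
    };

    // Mirrors IsWorkloadSupported above: run the validator and surface its
    // description through the optional out-parameter on failure.
    template<class FuncType, class... Args>
    bool IsWorkloadSupported(FuncType&& func, std::string* reasonIfUnsupported, Args&&... args)
    {
        const Status status = func(std::forward<Args>(args)...);
        if (!status.ok && reasonIfUnsupported)
        {
            *reasonIfUnsupported = status.description;
        }
        return status.ok;
    }

    int main()
    {
        // Illustrative validator echoing the odd-norm-size rule from
        // IsNeonNormalizationDescParamsSupported.
        auto validateNormSize = [](unsigned int normSize)
        {
            return Status{normSize % 2 == 1, "Normalization size must be an odd number."};
        };

        std::string reason;
        if (!IsWorkloadSupported(validateNormSize, &reason, 4u))
        {
            std::cout << "unsupported: " << reason << std::endl;
        }
        return 0;
    }

When built without NEON, the macro collapses to the IsNeonBackendSupported check alone, so the same call sites compile in both configurations.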
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
new file mode 100644
index 0000000000..95b14b3ba6
--- /dev/null
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -0,0 +1,163 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/DescriptorsFwd.hpp>
+#include <armnn/Types.hpp>
+#include <armnn/Tensor.hpp>
+
+#include <boost/optional.hpp>
+
+namespace armnn
+{
+
+bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc);
+
+bool IsNeonNormalizationDescParamsSupported(std::string* reasonIfUnsupported,
+ const NormalizationDescriptor& parameters);
+
+bool IsActivationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor,
+ std::string* reasonIfUnsupported);
+
+bool IsNeonDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported,
+ const DepthwiseConvolution2dDescriptor& parameters,
+ const TensorInfo& weights);
+
+bool IsAdditionSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported);
+
+bool IsBatchNormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConstantSupportedNeon(const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvolution2dSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported = nullptr);
+
+
+bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsDivisionSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsSubtractionSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsFullyConnectedSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsInputSupportedNeon(const TensorInfo& input,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsL2NormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs,
+ const OriginsDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsMultiplicationSupportedNeon(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsNormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsOutputSupportedNeon(const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsPermuteSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const PermuteDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsPooling2dSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsResizeBilinearSupportedNeon(const TensorInfo& input,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsSoftmaxSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsSplitterSupportedNeon(const TensorInfo& input,
+ const ViewsDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsFakeQuantizationSupportedNeon(const TensorInfo& input,
+ const FakeQuantizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsReshapeSupportedNeon(const TensorInfo& input,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsFloorSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsMeanSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const MeanDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+}
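
The header above mixes two kinds of checks: queries forwarded to Arm Compute validators, and plain data-type gates built from TrueFunc/FalseFunc selectors (see IsConstantSupportedNeon and IsFloorSupportedNeon in the .cpp). A self-contained sketch of that selector dispatch; the signatures below are simplified approximations of the LayerSupportCommon.hpp helpers, not the real ones:

    #include <iostream>
    #include <string>

    enum class DataType { Float16, Float32, QuantisedAsymm8 };

    // Selector that accepts unconditionally.
    template<typename... Params>
    bool TrueFunc(std::string*, Params&&...) { return true; }

    // Selector that rejects and reports why.
    template<typename... Params>
    bool FalseFunc(std::string* reason, Params&&...)
    {
        if (reason) { *reason = "data type not supported"; }
        return false;
    }

    // Simplified stand-in for IsSupportedForDataTypeGeneric: route the query
    // to the float or uint8 selector based on the tensor's data type.
    template<typename FloatFunc, typename Uint8Func>
    bool IsSupportedForDataType(std::string* reason, DataType type,
                                FloatFunc floatFunc, Uint8Func uint8Func)
    {
        switch (type)
        {
            case DataType::Float16:
            case DataType::Float32:         return floatFunc(reason);
            case DataType::QuantisedAsymm8: return uint8Func(reason);
        }
        return false;
    }

    int main()
    {
        std::string reason;
        // A float-only layer queried with a quantised tensor fails with a reason.
        const bool ok = IsSupportedForDataType(&reason, DataType::QuantisedAsymm8,
                                               TrueFunc<>, FalseFunc<>);
        std::cout << std::boolalpha << ok << " " << reason << std::endl;
        return 0;
    }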
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
new file mode 100644
index 0000000000..655427859b
--- /dev/null
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -0,0 +1,142 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backends/OutputHandler.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/runtime/MemoryGroup.h>
+#include <arm_compute/runtime/IMemoryGroup.h>
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/SubTensor.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Coordinates.h>
+
+#include <boost/polymorphic_pointer_cast.hpp>
+
+namespace armnn
+{
+
+class INeonTensorHandle : public ITensorHandle
+{
+public:
+ virtual arm_compute::ITensor& GetTensor() = 0;
+ virtual arm_compute::ITensor const& GetTensor() const = 0;
+ virtual arm_compute::DataType GetDataType() const = 0;
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) = 0;
+};
+
+class NeonTensorHandle : public INeonTensorHandle
+{
+public:
+ NeonTensorHandle(const TensorInfo& tensorInfo)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo);
+ }
+
+ NeonTensorHandle(const TensorInfo& tensorInfo, DataLayout dataLayout)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout);
+ }
+
+ arm_compute::ITensor& GetTensor() override { return m_Tensor; }
+ arm_compute::ITensor const& GetTensor() const override { return m_Tensor; }
+
+ virtual void Allocate() override
+ {
+ armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);
+ };
+
+ virtual void Manage() override
+ {
+ BOOST_ASSERT(m_MemoryGroup != nullptr);
+ m_MemoryGroup->manage(&m_Tensor);
+ }
+
+ virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; }
+
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override
+ {
+ m_MemoryGroup = boost::polymorphic_pointer_downcast<arm_compute::MemoryGroup>(memoryGroup);
+ }
+
+ virtual const void* Map(bool /* blocking = true */) const override
+ {
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override {}
+
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
+private:
+ arm_compute::Tensor m_Tensor;
+ std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup;
+};
+
+class NeonSubTensorHandle : public INeonTensorHandle
+{
+public:
+ NeonSubTensorHandle(INeonTensorHandle* parent,
+ const arm_compute::TensorShape& shape,
+ const arm_compute::Coordinates& coords)
+ : m_Tensor(&parent->GetTensor(), shape, coords)
+ {
+ parentHandle = parent;
+ }
+
+ arm_compute::ITensor& GetTensor() override { return m_Tensor; }
+ arm_compute::ITensor const& GetTensor() const override { return m_Tensor; }
+
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
+
+ virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; }
+
+ virtual ITensorHandle* GetParent() const override { return parentHandle; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ virtual const void* Map(bool /* blocking = true */) const override
+ {
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+private:
+ arm_compute::SubTensor m_Tensor;
+ ITensorHandle* parentHandle = nullptr;
+};
+
+}
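
NeonSubTensorHandle above owns no memory: Allocate() and Manage() are no-ops, and GetParent() returns the handle it was carved from, whereas a top-level NeonTensorHandle returns nullptr. A small sketch of how a caller could walk a chain of sub-tensors to the storage-owning handle; the stand-in types and the GetRootHandle helper are hypothetical, not part of the commit:

    #include <iostream>

    // Minimal stand-in for the ITensorHandle parent/child contract.
    struct ITensorHandle
    {
        virtual ~ITensorHandle() = default;
        virtual ITensorHandle* GetParent() const = 0;
    };

    struct RootHandle : ITensorHandle   // owns its storage, like NeonTensorHandle
    {
        ITensorHandle* GetParent() const override { return nullptr; }
    };

    struct SubHandle : ITensorHandle    // a view, like NeonSubTensorHandle
    {
        explicit SubHandle(ITensorHandle* p) : parent(p) {}
        ITensorHandle* GetParent() const override { return parent; }
        ITensorHandle* parent;
    };

    // Hypothetical helper: follow GetParent() until the owning handle.
    ITensorHandle* GetRootHandle(ITensorHandle* handle)
    {
        while (handle->GetParent() != nullptr)
        {
            handle = handle->GetParent();
        }
        return handle;
    }

    int main()
    {
        RootHandle root;
        SubHandle sub(&root);
        std::cout << (GetRootHandle(&sub) == &root) << std::endl; // prints 1
        return 0;
    }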
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
new file mode 100644
index 0000000000..dd91b152f2
--- /dev/null
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -0,0 +1,494 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "NeonWorkloadFactory.hpp"
+#include <armnn/Utils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <Layer.hpp>
+
+#ifdef ARMCOMPUTENEON_ENABLED
+#include <arm_compute/runtime/Allocator.h>
+
+#include <backends/MemCopyWorkload.hpp>
+#include "NeonTensorHandle.hpp"
+#include "workloads/NeonWorkloadUtils.hpp"
+#include "workloads/NeonWorkloads.hpp"
+
+#include <memory/IPoolManager.hpp>
+#endif
+
+#include <backends/MakeWorkloadHelper.hpp>
+
+#include <boost/polymorphic_cast.hpp>
+
+namespace armnn
+{
+
+bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
+{
+ return IWorkloadFactory::IsLayerSupported(Compute::CpuAcc, layer, dataType, outReasonIfUnsupported);
+}
+
+#ifdef ARMCOMPUTENEON_ENABLED
+
+NeonWorkloadFactory::NeonWorkloadFactory()
+ : m_MemoryManager(std::make_unique<arm_compute::Allocator>(), BaseMemoryManager::MemoryAffinity::Offset)
+{
+}
+
+std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const
+{
+ BOOST_ASSERT(parent.GetType() == ITensorHandle::Neon);
+
+ const arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
+
+ arm_compute::Coordinates coords;
+ coords.set_num_dimensions(subTensorShape.GetNumDimensions());
+ for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++)
+ {
+ // Arm compute indexes tensor coords in reverse order.
+ unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
+ coords.set(i, boost::numeric_cast<int>(subTensorOrigin[revertedIndex]));
+ }
+
+ return std::make_unique<NeonSubTensorHandle>(
+ boost::polymorphic_downcast<INeonTensorHandle*>(&parent), shape, coords);
+}
+
+std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
+{
+ auto tensorHandle = std::make_unique<NeonTensorHandle>(tensorInfo);
+ tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup());
+
+ return tensorHandle;
+}
+
+std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const
+{
+ auto tensorHandle = std::make_unique<NeonTensorHandle>(tensorInfo, dataLayout);
+ tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup());
+
+ return tensorHandle;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonActivationFloatWorkload, NeonActivationUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonSoftmaxFloatWorkload, NeonSoftmaxUint8Workload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonSplitterFloatWorkload, NeonSplitterUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMerger(const MergerQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonMergerFloatWorkload, NeonMergerUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateFullyConnected(
+ const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonFullyConnectedWorkload, NeonFullyConnectedWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonPermuteFloatWorkload, NeonPermuteUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonPooling2dFloatWorkload, NeonPooling2dUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateConvolution2d(
+ const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonConvolution2dFloatWorkload, NeonConvolution2dUint8Workload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthwiseConvolution2d(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonDepthwiseConvolutionFloatWorkload, NeonDepthwiseConvolutionUint8Workload>(
+ descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateNormalization(
+ const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonNormalizationFloatWorkload, NullWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonAdditionFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMultiplication(
+ const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonMultiplicationFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateDivision(
+ const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateSubtraction(
+ const SubtractionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonSubtractionFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateBatchNormalization(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonBatchNormalizationFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ if (descriptor.m_Inputs.empty() || !descriptor.m_Inputs[0])
+ {
+ throw InvalidArgumentException("NeonWorkloadFactory: Invalid null input for MemCopy workload");
+ }
+
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateResizeBilinear(
+ const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFakeQuantization(
+ const FakeQuantizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonL2NormalizationFloatWorkload, NullWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonConstantFloatWorkload, NeonConstantUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonReshapeFloatWorkload, NeonReshapeUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonFloorFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonLstmFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertFp32ToFp16Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+void NeonWorkloadFactory::Finalize()
+{
+ m_MemoryManager.Finalize();
+}
+
+void NeonWorkloadFactory::Release()
+{
+ m_MemoryManager.Release();
+}
+
+void NeonWorkloadFactory::Acquire()
+{
+ m_MemoryManager.Acquire();
+}
+
+#else // Compiled without ArmCompute libs
+
+NeonWorkloadFactory::NeonWorkloadFactory()
+{
+}
+
+std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMerger(const MergerQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthwiseConvolution2d(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateNormalization(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateBatchNormalization(const BatchNormalizationQueueDescriptor& data,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMultiplication(const MultiplicationQueueDescriptor& data,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFakeQuantization(
+ const FakeQuantizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDivision(const DivisionQueueDescriptor& data,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSubtraction(const SubtractionQueueDescriptor& data,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+void NeonWorkloadFactory::Finalize()
+{}
+
+void NeonWorkloadFactory::Release()
+{}
+
+void NeonWorkloadFactory::Acquire()
+{}
+
+#endif
+
+} //namespace armnn
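
One detail worth a worked example from CreateSubTensorHandle above: armnn orders tensor dimensions outermost-first while Arm Compute indexes coordinates innermost-first, so origin index i lands at coords index numDims - i - 1. A self-contained sketch of the mapping; the ToAclCoords helper is illustrative only:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Reverse an armnn-ordered origin into Arm-Compute-ordered coordinates,
    // matching the revertedIndex loop in CreateSubTensorHandle.
    std::vector<int> ToAclCoords(const std::vector<unsigned int>& origin)
    {
        const std::size_t numDims = origin.size();
        std::vector<int> coords(numDims);
        for (std::size_t i = 0; i < numDims; ++i)
        {
            coords[i] = static_cast<int>(origin[numDims - i - 1]);
        }
        return coords;
    }

    int main()
    {
        // A 4-D origin {N, C, H, W} = {0, 2, 4, 8} becomes {8, 4, 2, 0}.
        for (int c : ToAclCoords({0u, 2u, 4u, 8u}))
        {
            std::printf("%d ", c);
        }
        std::printf("\n");
        return 0;
    }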
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
new file mode 100644
index 0000000000..440bba672a
--- /dev/null
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -0,0 +1,138 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backends/OutputHandler.hpp>
+
+#include <memory/BaseMemoryManager.hpp>
+
+#include <boost/core/ignore_unused.hpp>
+#include <boost/optional.hpp>
+
+namespace armnn
+{
+
+// Neon workload factory.
+class NeonWorkloadFactory : public IWorkloadFactory
+{
+public:
+ NeonWorkloadFactory();
+
+ virtual Compute GetCompute() const override { return Compute::CpuAcc; }
+
+ static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
+
+ virtual bool SupportsSubTensors() const override { return true; }
+
+ virtual std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const override;
+
+ virtual std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo) const override;
+
+ virtual std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateInput(const InputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateOutput(const OutputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateActivation(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateSplitter(const SplitterQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMerger(const MergerQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreatePermute(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateDepthwiseConvolution2d(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateNormalization(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMultiplication(const MultiplicationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateAddition(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateBatchNormalization(const BatchNormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateFakeQuantization(const FakeQuantizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMean(const MeanQueueDescriptor& descriptor,
+ const WorkloadInfo& Info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreatePad(const PadQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual void Finalize() override;
+
+ virtual void Release() override;
+
+ virtual void Acquire() override;
+
+private:
+#ifdef ARMCOMPUTENEON_ENABLED
+ mutable NeonMemoryManager m_MemoryManager;
+#endif
+};
+
+} //namespace armnn
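
The factory methods declared above lean on MakeWorkload<FloatWorkload, Uint8Workload>(...) from backends/MakeWorkloadHelper.hpp to pick an implementation from the data type, with NullWorkload filling the slot for unsupported combinations. A simplified, self-contained sketch of that dispatch, under the assumption that the real helper inspects the WorkloadInfo; all types below are stand-ins:

    #include <memory>

    enum class DataType { Float32, QuantisedAsymm8 };

    struct QueueDescriptor {};
    struct WorkloadInfo { DataType dataType; };

    struct IWorkload { virtual ~IWorkload() = default; };
    struct FloatAdditionWorkload : IWorkload
    {
        FloatAdditionWorkload(const QueueDescriptor&, const WorkloadInfo&) {}
    };
    struct NullWorkload : IWorkload
    {
        NullWorkload(const QueueDescriptor&, const WorkloadInfo&) {}
    };

    // Simplified stand-in for MakeWorkload: choose the workload class from
    // the data type recorded in the workload info.
    template<typename FloatWorkload, typename Uint8Workload>
    std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptor& descriptor,
                                            const WorkloadInfo& info)
    {
        if (info.dataType == DataType::Float32)
        {
            return std::make_unique<FloatWorkload>(descriptor, info);
        }
        return std::make_unique<Uint8Workload>(descriptor, info);
    }

    int main()
    {
        // Mirrors MakeWorkload<NeonAdditionFloatWorkload, NullWorkload> in the
        // factory: float input gets the real workload, uint8 falls through to
        // the null slot.
        auto workload = MakeWorkload<FloatAdditionWorkload, NullWorkload>(
            QueueDescriptor{}, WorkloadInfo{DataType::Float32});
        return workload != nullptr ? 0 : 1;
    }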
diff --git a/src/backends/neon/backend.cmake b/src/backends/neon/backend.cmake
new file mode 100644
index 0000000000..5f02c845ed
--- /dev/null
+++ b/src/backends/neon/backend.cmake
@@ -0,0 +1,13 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+if(ARMCOMPUTENEON)
+ add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/neon)
+ list(APPEND armnnLibraries armnnNeonBackend armnnNeonBackendWorkloads)
+else()
+ message("NEON backend is disabled")
+ add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/neon)
+ list(APPEND armnnLibraries armnnNeonBackend)
+endif()
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
new file mode 100644
index 0000000000..a59966fb39
--- /dev/null
+++ b/src/backends/neon/backend.mk
@@ -0,0 +1,47 @@
+#
+# Copyright © 2017 ARM Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+# BACKEND_SOURCES contains the list of files to be included
+# in the Android build and it is picked up by the Android.mk
+# file in the root of ArmNN
+
+BACKEND_SOURCES := \
+ NeonLayerSupport.cpp \
+ NeonWorkloadFactory.cpp \
+ workloads/NeonActivationFloatWorkload.cpp \
+ workloads/NeonActivationUint8Workload.cpp \
+ workloads/NeonAdditionFloatWorkload.cpp \
+ workloads/NeonBatchNormalizationFloatWorkload.cpp \
+ workloads/NeonConstantFloatWorkload.cpp \
+ workloads/NeonConstantUint8Workload.cpp \
+ workloads/NeonConvertFp16ToFp32Workload.cpp \
+ workloads/NeonConvertFp32ToFp16Workload.cpp \
+ workloads/NeonConvolution2dBaseWorkload.cpp \
+ workloads/NeonConvolution2dFloatWorkload.cpp \
+ workloads/NeonConvolution2dUint8Workload.cpp \
+ workloads/NeonDepthwiseConvolutionBaseWorkload.cpp \
+ workloads/NeonDepthwiseConvolutionFloatWorkload.cpp \
+ workloads/NeonDepthwiseConvolutionUint8Workload.cpp \
+ workloads/NeonFloorFloatWorkload.cpp \
+ workloads/NeonFullyConnectedWorkload.cpp \
+ workloads/NeonL2NormalizationFloatWorkload.cpp \
+ workloads/NeonLstmFloatWorkload.cpp \
+ workloads/NeonMergerFloatWorkload.cpp \
+ workloads/NeonMergerUint8Workload.cpp \
+ workloads/NeonMultiplicationFloatWorkload.cpp \
+ workloads/NeonNormalizationFloatWorkload.cpp \
+ workloads/NeonPermuteWorkload.cpp \
+ workloads/NeonPooling2dBaseWorkload.cpp \
+ workloads/NeonPooling2dFloatWorkload.cpp \
+ workloads/NeonPooling2dUint8Workload.cpp \
+ workloads/NeonReshapeFloatWorkload.cpp \
+ workloads/NeonReshapeUint8Workload.cpp \
+ workloads/NeonSoftmaxBaseWorkload.cpp \
+ workloads/NeonSoftmaxFloatWorkload.cpp \
+ workloads/NeonSoftmaxUint8Workload.cpp \
+ workloads/NeonSplitterFloatWorkload.cpp \
+ workloads/NeonSplitterUint8Workload.cpp \
+ workloads/NeonSubtractionFloatWorkload.cpp \
+ workloads/NeonWorkloadUtils.cpp
diff --git a/src/backends/neon/test/CMakeLists.txt b/src/backends/neon/test/CMakeLists.txt
new file mode 100644
index 0000000000..f41a074999
--- /dev/null
+++ b/src/backends/neon/test/CMakeLists.txt
@@ -0,0 +1,4 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
new file mode 100644
index 0000000000..850c65cb4e
--- /dev/null
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -0,0 +1,86 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnNeonBackendWorkloads_sources
+ NeonActivationFloatWorkload.cpp
+ NeonActivationFloatWorkload.hpp
+ NeonActivationUint8Workload.cpp
+ NeonActivationUint8Workload.hpp
+ NeonAdditionFloatWorkload.cpp
+ NeonAdditionFloatWorkload.hpp
+ NeonBaseConstantWorkload.hpp
+ NeonBaseMergerWorkload.hpp
+ NeonBaseSplitterWorkload.hpp
+ NeonBatchNormalizationFloatWorkload.cpp
+ NeonBatchNormalizationFloatWorkload.hpp
+ NeonConstantFloatWorkload.cpp
+ NeonConstantFloatWorkload.hpp
+ NeonConstantUint8Workload.cpp
+ NeonConstantUint8Workload.hpp
+ NeonConvertFp16ToFp32Workload.cpp
+ NeonConvertFp16ToFp32Workload.hpp
+ NeonConvertFp32ToFp16Workload.cpp
+ NeonConvertFp32ToFp16Workload.hpp
+ NeonConvolution2dBaseWorkload.cpp
+ NeonConvolution2dBaseWorkload.hpp
+ NeonConvolution2dFloatWorkload.cpp
+ NeonConvolution2dFloatWorkload.hpp
+ NeonConvolution2dUint8Workload.cpp
+ NeonConvolution2dUint8Workload.hpp
+ NeonDepthwiseConvolutionBaseWorkload.cpp
+ NeonDepthwiseConvolutionBaseWorkload.hpp
+ NeonDepthwiseConvolutionFloatWorkload.cpp
+ NeonDepthwiseConvolutionFloatWorkload.hpp
+ NeonDepthwiseConvolutionUint8Workload.cpp
+ NeonDepthwiseConvolutionUint8Workload.hpp
+ NeonFloorFloatWorkload.cpp
+ NeonFloorFloatWorkload.hpp
+ NeonFullyConnectedWorkload.cpp
+ NeonFullyConnectedWorkload.hpp
+ NeonL2NormalizationFloatWorkload.cpp
+ NeonL2NormalizationFloatWorkload.hpp
+ NeonLstmFloatWorkload.cpp
+ NeonLstmFloatWorkload.hpp
+ NeonMergerFloatWorkload.cpp
+ NeonMergerFloatWorkload.hpp
+ NeonMergerUint8Workload.cpp
+ NeonMergerUint8Workload.hpp
+ NeonMultiplicationFloatWorkload.cpp
+ NeonMultiplicationFloatWorkload.hpp
+ NeonNormalizationFloatWorkload.cpp
+ NeonNormalizationFloatWorkload.hpp
+ NeonPermuteWorkload.cpp
+ NeonPermuteWorkload.hpp
+ NeonPooling2dBaseWorkload.cpp
+ NeonPooling2dBaseWorkload.hpp
+ NeonPooling2dFloatWorkload.cpp
+ NeonPooling2dFloatWorkload.hpp
+ NeonPooling2dUint8Workload.cpp
+ NeonPooling2dUint8Workload.hpp
+ NeonReshapeFloatWorkload.cpp
+ NeonReshapeFloatWorkload.hpp
+ NeonReshapeUint8Workload.cpp
+ NeonReshapeUint8Workload.hpp
+ NeonSoftmaxBaseWorkload.cpp
+ NeonSoftmaxBaseWorkload.hpp
+ NeonSoftmaxFloatWorkload.cpp
+ NeonSoftmaxFloatWorkload.hpp
+ NeonSoftmaxUint8Workload.cpp
+ NeonSoftmaxUint8Workload.hpp
+ NeonSplitterFloatWorkload.cpp
+ NeonSplitterFloatWorkload.hpp
+ NeonSplitterUint8Workload.cpp
+ NeonSplitterUint8Workload.hpp
+ NeonSubtractionFloatWorkload.cpp
+ NeonSubtractionFloatWorkload.hpp
+ NeonWorkloads.hpp
+ NeonWorkloadUtils.cpp
+ NeonWorkloadUtils.hpp
+)
+
+add_library(armnnNeonBackendWorkloads STATIC ${armnnNeonBackendWorkloads_sources})
+target_include_directories(armnnNeonBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(armnnNeonBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnNeonBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
diff --git a/src/backends/neon/workloads/NeonActivationFloatWorkload.cpp b/src/backends/neon/workloads/NeonActivationFloatWorkload.cpp
new file mode 100644
index 0000000000..1d6bf70431
--- /dev/null
+++ b/src/backends/neon/workloads/NeonActivationFloatWorkload.cpp
@@ -0,0 +1,57 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonActivationFloatWorkload.hpp"
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+
+
+namespace armnn
+{
+
+arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ const arm_compute::ActivationLayerInfo activationLayerInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(descriptor);
+
+ if (input.GetDataType() == DataType::QuantisedAsymm8 &&
+ activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+ "Neon: Logistic Activations unsupported with QAsymm8 data type."};
+ }
+
+ return arm_compute::NEActivationLayer::validate(&aclInput,
+ &aclOutput,
+ activationLayerInfo);
+}
+
+NeonActivationFloatWorkload::NeonActivationFloatWorkload(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<ActivationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonActivationFloatWorkload", 1, 1);
+
+ const arm_compute::ActivationLayerInfo activationLayerInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(m_Data.m_Parameters);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_ActivationLayer.configure(&input, &output, activationLayerInfo);
+}
+
+void NeonActivationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationFloatWorkload_Execute");
+ m_ActivationLayer.run();
+}
+
+} //namespace armnn
+
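The validate function above is what the layer-support code consults before any workload is constructed. A usage sketch, with arbitrary shapes (illustrative only):

armnn::TensorInfo inputInfo({1, 16}, armnn::DataType::Float32);
armnn::TensorInfo outputInfo({1, 16}, armnn::DataType::Float32);

armnn::ActivationDescriptor activationDescriptor;
activationDescriptor.m_Function = armnn::ActivationFunction::ReLu;

arm_compute::Status status =
    armnn::NeonActivationWorkloadValidate(inputInfo, outputInfo, activationDescriptor);
if (status.error_code() != arm_compute::ErrorCode::OK)
{
    // Not supported on this backend; status.error_description() explains why.
}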
diff --git a/src/backends/neon/workloads/NeonActivationFloatWorkload.hpp b/src/backends/neon/workloads/NeonActivationFloatWorkload.hpp
new file mode 100644
index 0000000000..4d2f51fb4f
--- /dev/null
+++ b/src/backends/neon/workloads/NeonActivationFloatWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor);
+
+class NeonActivationFloatWorkload : public FloatWorkload<ActivationQueueDescriptor>
+{
+public:
+ NeonActivationFloatWorkload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::NEActivationLayer m_ActivationLayer;
+};
+} //namespace armnn
+
+
+
diff --git a/src/backends/neon/workloads/NeonActivationUint8Workload.cpp b/src/backends/neon/workloads/NeonActivationUint8Workload.cpp
new file mode 100644
index 0000000000..4aed6b510f
--- /dev/null
+++ b/src/backends/neon/workloads/NeonActivationUint8Workload.cpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonActivationUint8Workload.hpp"
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/neon/NeonLayerSupport.hpp>
+
+namespace armnn
+{
+NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Uint8Workload<ActivationQueueDescriptor>(descriptor, info)
+{
+ auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function);
+ arm_compute::ActivationLayerInfo layerInfo(activation,
+ m_Data.m_Parameters.m_A,
+ m_Data.m_Parameters.m_B);
+
+ m_Data.ValidateInputsOutputs("NeonActivationUint8Workload", 1, 1);
+
+ arm_compute::ITensor& input = static_cast<NeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = static_cast<NeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_ActivationLayer.configure(&input, &output, layerInfo);
+}
+
+void NeonActivationUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationUint8Workload_Execute");
+
+ m_ActivationLayer.run();
+}
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonActivationUint8Workload.hpp b/src/backends/neon/workloads/NeonActivationUint8Workload.hpp
new file mode 100644
index 0000000000..56e3544379
--- /dev/null
+++ b/src/backends/neon/workloads/NeonActivationUint8Workload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonActivationUint8Workload : public Uint8Workload<ActivationQueueDescriptor>
+{
+public:
+ NeonActivationUint8Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEActivationLayer m_ActivationLayer;
+};
+
+} //namespace armnn
+
+
+
+
+
diff --git a/src/backends/neon/workloads/NeonAdditionFloatWorkload.cpp b/src/backends/neon/workloads/NeonAdditionFloatWorkload.cpp
new file mode 100644
index 0000000000..445e32ea44
--- /dev/null
+++ b/src/backends/neon/workloads/NeonAdditionFloatWorkload.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonAdditionFloatWorkload.hpp"
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::NEArithmeticAddition::validate(&aclInput0,
+ &aclInput1,
+ &aclOutput,
+ arm_compute::ConvertPolicy::SATURATE);
+}
+
+
+NeonAdditionFloatWorkload::NeonAdditionFloatWorkload(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<AdditionQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonAdditionFloatWorkload", 2, 1);
+
+ arm_compute::ITensor& input1 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& input2 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_AddLayer.configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE);
+}
+
+void NeonAdditionFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAdditionFloatWorkload_Execute");
+ m_AddLayer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonAdditionFloatWorkload.hpp b/src/backends/neon/workloads/NeonAdditionFloatWorkload.hpp
new file mode 100644
index 0000000000..769492e949
--- /dev/null
+++ b/src/backends/neon/workloads/NeonAdditionFloatWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class NeonAdditionFloatWorkload : public FloatWorkload<AdditionQueueDescriptor>
+{
+public:
+ NeonAdditionFloatWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEArithmeticAddition m_AddLayer;
+};
+
+} //namespace armnn
+
+
+
diff --git a/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp b/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp
new file mode 100644
index 0000000000..6bb275ac13
--- /dev/null
+++ b/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp
@@ -0,0 +1,82 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <arm_compute/core/Types.h>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/neon/NeonTensorHandle.hpp>
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/Workload.hpp>
+#include <Half.hpp>
+
+#include <boost/cast.hpp>
+
+namespace armnn
+{
+
+// Base class template providing an implementation of the Constant layer common to all data types.
+template <armnn::DataType... DataFormats>
+class NeonBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataFormats...>
+{
+public:
+ NeonBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : TypedWorkload<ConstantQueueDescriptor, DataFormats...>(descriptor, info)
+ , m_RanOnce(false)
+ {
+ }
+
+ virtual void Execute() const override
+ {
+ using namespace armcomputetensorutils;
+
+ // The intermediate tensor held by the corresponding layer output handler can be initialised with the
+ // given data on the first inference, then reused for subsequent inferences.
+ // The initialisation cannot happen at workload construction time since the ACL kernel for the next layer
+ // may not have been configured at the time.
+ if (!m_RanOnce)
+ {
+ const ConstantQueueDescriptor& data = this->m_Data;
+
+ BOOST_ASSERT(data.m_LayerOutput != nullptr);
+ arm_compute::ITensor& output =
+ boost::polymorphic_downcast<NeonTensorHandle*>(data.m_Outputs[0])->GetTensor();
+ arm_compute::DataType computeDataType =
+ boost::polymorphic_downcast<NeonTensorHandle*>(data.m_Outputs[0])->GetDataType();
+
+ switch (computeDataType)
+ {
+ case arm_compute::DataType::F16:
+ {
+ CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
+ break;
+ }
+ case arm_compute::DataType::F32:
+ {
+ CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<float>(), output);
+ break;
+ }
+ case arm_compute::DataType::QASYMM8:
+ {
+ CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<uint8_t>(), output);
+ break;
+ }
+ default:
+ {
+ BOOST_ASSERT_MSG(false, "Unknown data type");
+ break;
+ }
+ }
+
+ m_RanOnce = true;
+ }
+ }
+
+private:
+ mutable bool m_RanOnce;
+};
+
+} //namespace armnn
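The m_RanOnce latch makes the copy a one-time cost: the constant data is written into the ACL tensor on the first inference, and every later Execute() skips the copy. A sketch of the observable behaviour, using the Float subclass declared further down in this patch (the descriptor wiring is assumed done elsewhere):

armnn::ConstantQueueDescriptor constantDescriptor; // m_LayerOutput and m_Outputs assumed populated
armnn::WorkloadInfo workloadInfo;
armnn::NeonConstantFloatWorkload constantWorkload(constantDescriptor, workloadInfo);

constantWorkload.Execute(); // first call: copies m_LayerOutput into the output tensor
constantWorkload.Execute(); // subsequent calls: m_RanOnce is set, the copy is skipped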
diff --git a/src/backends/neon/workloads/NeonBaseMergerWorkload.hpp b/src/backends/neon/workloads/NeonBaseMergerWorkload.hpp
new file mode 100644
index 0000000000..9ff09f6c7c
--- /dev/null
+++ b/src/backends/neon/workloads/NeonBaseMergerWorkload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <backends/Workload.hpp>
+
+namespace armnn
+{
+// Base class template providing an implementation of the Merger layer common to all data types.
+template <armnn::DataType... DataTypes>
+class NeonBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...>
+{
+public:
+ using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload;
+
+ virtual void Execute() const override
+ {
+ // With subtensors, merger is a no-op.
+ }
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonBaseSplitterWorkload.hpp b/src/backends/neon/workloads/NeonBaseSplitterWorkload.hpp
new file mode 100644
index 0000000000..dcee93363d
--- /dev/null
+++ b/src/backends/neon/workloads/NeonBaseSplitterWorkload.hpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+// Base class template providing an implementation of the Splitter layer common to all data types.
+template <armnn::DataType... DataTypes>
+class NeonBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...>
+{
+public:
+ using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload;
+
+ virtual void Execute() const override
+ {
+ // With subtensors, splitter is a no-op.
+ }
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.cpp
new file mode 100644
index 0000000000..2383e78df3
--- /dev/null
+++ b/src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.cpp
@@ -0,0 +1,96 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonBatchNormalizationFloatWorkload.hpp"
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <armnn/ArmNN.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+
+arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean);
+ const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var);
+ const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta);
+ const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma);
+
+ return arm_compute::NEBatchNormalizationLayer::validate(&aclInputInfo,
+ &aclOutputInfo,
+ &aclMeanInfo,
+ &aclVarInfo,
+ &aclBetaInfo,
+ &aclGammaInfo,
+ descriptor.m_Eps);
+}
+
+NeonBatchNormalizationFloatWorkload::NeonBatchNormalizationFloatWorkload(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonBatchNormalizationFloatWorkload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Mean = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo());
+
+ m_Variance = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo());
+
+ m_Gamma = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo());
+
+ m_Beta = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
+
+ m_Layer.configure(&input,
+ &output,
+ m_Mean.get(),
+ m_Variance.get(),
+ m_Beta.get(),
+ m_Gamma.get(),
+ m_Data.m_Parameters.m_Eps);
+
+ InitializeArmComputeTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean);
+ InitializeArmComputeTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance);
+ InitializeArmComputeTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma);
+ InitializeArmComputeTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta);
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // the input tensors that are no longer needed can be freed.
+ m_Layer.prepare();
+ FreeUnusedTensors();
+}
+
+void NeonBatchNormalizationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+void NeonBatchNormalizationFloatWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_Mean);
+ FreeTensorIfUnused(m_Variance);
+ FreeTensorIfUnused(m_Gamma);
+ FreeTensorIfUnused(m_Beta);
+}
+
+} //namespace armnn
+
+
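FreeUnusedTensors leans on the FreeTensorIfUnused helper from the workload utilities; its assumed shape is roughly the following sketch, keyed off ACL's ITensor::is_used() flag:

template <typename Tensor>
void FreeTensorIfUnused(std::unique_ptr<Tensor>& tensor)
{
    // prepare() marks consumed inputs as unused, so their backing memory
    // can be released once the layer has made its internal copies.
    if (tensor && !tensor->is_used())
    {
        tensor.reset();
    }
}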
diff --git a/src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.hpp
new file mode 100644
index 0000000000..59c7404c44
--- /dev/null
+++ b/src/backends/neon/workloads/NeonBatchNormalizationFloatWorkload.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor);
+
+class NeonBatchNormalizationFloatWorkload : public FloatWorkload<BatchNormalizationQueueDescriptor>
+{
+public:
+ NeonBatchNormalizationFloatWorkload(const BatchNormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEBatchNormalizationLayer m_Layer;
+
+ std::unique_ptr<arm_compute::Tensor> m_Mean;
+ std::unique_ptr<arm_compute::Tensor> m_Variance;
+ std::unique_ptr<arm_compute::Tensor> m_Gamma;
+ std::unique_ptr<arm_compute::Tensor> m_Beta;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
+
+
+
diff --git a/src/backends/neon/workloads/NeonConstantFloatWorkload.cpp b/src/backends/neon/workloads/NeonConstantFloatWorkload.cpp
new file mode 100644
index 0000000000..dbdd057101
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConstantFloatWorkload.cpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConstantFloatWorkload.hpp"
+
+namespace armnn
+{
+
+void NeonConstantFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantFloatWorkload_Execute");
+ NeonBaseConstantWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConstantFloatWorkload.hpp b/src/backends/neon/workloads/NeonConstantFloatWorkload.hpp
new file mode 100644
index 0000000000..c35b5fda3e
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConstantFloatWorkload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseConstantWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonConstantFloatWorkload : public NeonBaseConstantWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ using NeonBaseConstantWorkload<DataType::Float16, DataType::Float32>::NeonBaseConstantWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConstantUint8Workload.cpp b/src/backends/neon/workloads/NeonConstantUint8Workload.cpp
new file mode 100644
index 0000000000..c607d86844
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConstantUint8Workload.cpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConstantUint8Workload.hpp"
+
+namespace armnn
+{
+
+void NeonConstantUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantUint8Workload_Execute");
+ NeonBaseConstantWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConstantUint8Workload.hpp b/src/backends/neon/workloads/NeonConstantUint8Workload.hpp
new file mode 100644
index 0000000000..2cb9516afe
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConstantUint8Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseConstantWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonConstantUint8Workload : public NeonBaseConstantWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ using NeonBaseConstantWorkload<DataType::QuantisedAsymm8>::NeonBaseConstantWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
new file mode 100644
index 0000000000..86ec31c71d
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertFp16ToFp32Workload.hpp"
+#include "Half.hpp"
+#include "FloatingPointConverter.hpp"
+
+#include "backends/WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+NeonConvertFp16ToFp32Workload::NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertFp16ToFp32Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp16ToFp32Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp16ToFp32Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const Half*>(src);
+ auto output = reinterpret_cast<float*>(dst);
+ size_t numElements = size/2; // 2 bytes per fp16
+ armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
new file mode 100644
index 0000000000..dcf6998c64
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/WorkloadData.hpp>
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
+{
+public:
+ NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
new file mode 100644
index 0000000000..0f4fbe4e93
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertFp32ToFp16Workload.hpp"
+
+#include "Half.hpp"
+#include "FloatingPointConverter.hpp"
+
+#include "Profiling.hpp"
+#include "backends/WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+NeonConvertFp32ToFp16Workload::NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToFp16Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp32ToFp16Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToFp16Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const float*>(src);
+ auto output = reinterpret_cast<Half*>(dst);
+ size_t numElements = size/2; // 2 bytes per fp16
+ armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
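Both conversion workloads compute the element count as size/2, which is only correct if size is the byte count of the half-precision side of the copy in each direction; that appears to be the contract of CopyTensorContentsGeneric here, but it is worth spelling out as an assumption:

// Assuming 'size' is the fp16 buffer's byte count (2 bytes per Half element):
size_t fp16Bytes = 512;
size_t numElements = fp16Bytes / 2;             // 256 elements in either direction
size_t fp32Bytes = numElements * sizeof(float); // the fp32 side spans 1024 bytes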
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
new file mode 100644
index 0000000000..b819a8c542
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/WorkloadData.hpp>
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
+{
+public:
+ NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp b/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp
new file mode 100644
index 0000000000..547f563d59
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp
@@ -0,0 +1,146 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/neon/NeonLayerSupport.hpp>
+
+#include "NeonConvolution2dBaseWorkload.hpp"
+
+#include <armnn/Types.hpp>
+#include <Half.hpp>
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+
+ arm_compute::TensorInfo aclBiasesInfo;
+ arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ BOOST_ASSERT(biases.is_initialized());
+
+ aclBiasesInfo = BuildArmComputeTensorInfo(biases.get(), descriptor.m_DataLayout);
+ optionalAclBiasesInfo = &aclBiasesInfo;
+ }
+
+ arm_compute::PadStrideInfo layerInfo = BuildArmComputePadStrideInfo(descriptor);
+
+ return arm_compute::NEConvolutionLayer::validate(&aclInputInfo,
+ &aclWeightsInfo,
+ optionalAclBiasesInfo,
+ &aclOutputInfo,
+ layerInfo);
+}
+
+template<armnn::DataType... dataTypes>
+NeonConvolution2dBaseWorkload<dataTypes...>::NeonConvolution2dBaseWorkload(
+ const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>(descriptor, info)
+{
+ using arm_compute::NEDirectConvolutionLayer;
+
+ ValidateData();
+
+ // TODO: check tensor shapes match.
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_KernelTensor, m_Data.m_Weight->GetTensorInfo(), descriptor.m_DataLayout);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+ }
+
+ arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+ m_Data.m_Parameters.m_StrideY,
+ m_Data.m_Parameters.m_PadLeft,
+ m_Data.m_Parameters.m_PadRight,
+ m_Data.m_Parameters.m_PadTop,
+ m_Data.m_Parameters.m_PadBottom,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ const bool preferDirectConvolution =
+ IsNeonDirectConvolutionPreferred(m_Data.m_Weight->GetTensorInfo(),
+ m_Data.m_Parameters);
+
+ if (preferDirectConvolution)
+ {
+ auto directConvolutionLayer = std::make_unique<arm_compute::NEDirectConvolutionLayer>(memoryManager);
+ directConvolutionLayer->configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+ m_ConvolutionLayer.reset(directConvolutionLayer.release());
+ }
+ else
+ {
+ auto convolutionLayer = std::make_unique<arm_compute::NEConvolutionLayer>(memoryManager);
+ convolutionLayer->configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+ m_ConvolutionLayer.reset(convolutionLayer.release());
+ }
+ BOOST_ASSERT(m_ConvolutionLayer);
+
+ armnn::DataType dataType = m_Data.m_Weight->GetTensorInfo().GetDataType();
+
+ switch (dataType)
+ {
+ case DataType::Float16:
+ {
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<Half>());
+ break;
+ }
+ case DataType::Float32:
+ {
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<float>());
+ break;
+ }
+ case DataType::QuantisedAsymm8:
+ {
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<uint8_t>());
+ break;
+ }
+ default:
+ {
+ BOOST_ASSERT_MSG(false, "Unknown DataType.");
+ }
+ }
+}
+
+template<armnn::DataType... dataTypes>
+void NeonConvolution2dBaseWorkload<dataTypes...>::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
+// Generates known implementations for the linker.
+template class NeonConvolution2dBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class NeonConvolution2dBaseWorkload<armnn::DataType::QuantisedAsymm8>;
+
+} //namespace armnn
+
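A usage sketch for the validate helper above, showing how the boost::optional biases argument behaves (shapes are illustrative; the optional is only dereferenced when m_BiasEnabled is set):

armnn::TensorInfo inputInfo({1, 3, 8, 8}, armnn::DataType::Float32);   // NCHW
armnn::TensorInfo weightsInfo({4, 3, 3, 3}, armnn::DataType::Float32); // OIHW
armnn::TensorInfo outputInfo({1, 4, 6, 6}, armnn::DataType::Float32);  // 3x3 kernel, stride 1, no padding

armnn::Convolution2dDescriptor convDescriptor;
convDescriptor.m_StrideX = 1;
convDescriptor.m_StrideY = 1;
convDescriptor.m_BiasEnabled = false;

boost::optional<armnn::TensorInfo> noBias; // empty is fine while bias is disabled
arm_compute::Status status = armnn::NeonConvolution2dWorkloadValidate(
    inputInfo, outputInfo, convDescriptor, weightsInfo, noBias);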
diff --git a/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.hpp b/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.hpp
new file mode 100644
index 0000000000..6af89c1f01
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.hpp
@@ -0,0 +1,49 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/neon/NeonLayerSupport.hpp>
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <boost/optional.hpp>
+
+#include <memory>
+
+namespace armnn
+{
+
+arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases);
+
+template<armnn::DataType... dataTypes>
+class NeonConvolution2dBaseWorkload : public TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>
+{
+public:
+ using TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>::m_Data;
+
+ NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+
+ virtual void ValidateData() const {}
+
+protected:
+ std::unique_ptr<arm_compute::IFunction> m_ConvolutionLayer;
+
+ std::unique_ptr<arm_compute::Tensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvolution2dFloatWorkload.cpp b/src/backends/neon/workloads/NeonConvolution2dFloatWorkload.cpp
new file mode 100644
index 0000000000..cd26f8d536
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvolution2dFloatWorkload.cpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvolution2dFloatWorkload.hpp"
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/neon/NeonLayerSupport.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+NeonConvolution2dFloatWorkload::NeonConvolution2dFloatWorkload(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : NeonConvolution2dBaseWorkload(descriptor, info, memoryManager)
+{
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ m_ConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
+
+void NeonConvolution2dFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dFloatWorkload_Execute");
+ m_ConvolutionLayer->run();
+}
+
+void NeonConvolution2dFloatWorkload::ValidateData() const
+{
+ m_Data.ValidateInputsOutputs("NeonConvolution2dFloatWorkload", 1, 1);
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonConvolution2dFloatWorkload.hpp b/src/backends/neon/workloads/NeonConvolution2dFloatWorkload.hpp
new file mode 100644
index 0000000000..14c77c8bd0
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvolution2dFloatWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonConvolution2dBaseWorkload.hpp"
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+
+#include <memory>
+
+namespace armnn
+{
+
+class NeonConvolution2dFloatWorkload : public NeonConvolution2dBaseWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ NeonConvolution2dFloatWorkload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+
+ void Execute() const override;
+ void ValidateData() const override;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonConvolution2dUint8Workload.cpp b/src/backends/neon/workloads/NeonConvolution2dUint8Workload.cpp
new file mode 100644
index 0000000000..5affe682b4
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvolution2dUint8Workload.cpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvolution2dUint8Workload.hpp"
+
+namespace armnn
+{
+
+NeonConvolution2dUint8Workload::NeonConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : NeonConvolution2dBaseWorkload(descriptor, info, memoryManager)
+{
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>());
+ }
+
+ m_ConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
+
+void NeonConvolution2dUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dUint8Workload_Execute");
+ m_ConvolutionLayer->run();
+}
+
+void NeonConvolution2dUint8Workload::ValidateData() const
+{
+ m_Data.ValidateInputsOutputs("NeonConvolution2dUint8Workload", 1, 1);
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvolution2dUint8Workload.hpp b/src/backends/neon/workloads/NeonConvolution2dUint8Workload.hpp
new file mode 100644
index 0000000000..ef60fc3e84
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvolution2dUint8Workload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonConvolution2dBaseWorkload.hpp"
+
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+
+#include <memory>
+
+namespace armnn
+{
+
+class NeonConvolution2dUint8Workload : public NeonConvolution2dBaseWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ NeonConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+
+ virtual void ValidateData() const override;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.cpp
new file mode 100644
index 0000000000..ef60b3238d
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.cpp
@@ -0,0 +1,49 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonDepthwiseConvolutionBaseWorkload.hpp"
+
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases)
+{
+ const arm_compute::TensorInfo aclInputInfo =
+ armcomputetensorutils::BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclOutputInfo =
+ armcomputetensorutils::BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclWeightsInfo =
+ armcomputetensorutils::BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+
+ arm_compute::TensorInfo aclBiasesInfo;
+ arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ BOOST_ASSERT(biases.is_initialized());
+
+ aclBiasesInfo = armcomputetensorutils::BuildArmComputeTensorInfo(biases.get(), descriptor.m_DataLayout);
+ optionalAclBiasesInfo = &aclBiasesInfo;
+ }
+
+ const arm_compute::PadStrideInfo aclPadStrideInfo =
+ armcomputetensorutils::BuildArmComputePadStrideInfo(descriptor);
+ const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+ return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo,
+ &aclWeightsInfo,
+ optionalAclBiasesInfo,
+ &aclOutputInfo,
+ aclPadStrideInfo,
+ aclDepthMultiplier);
+}
+
+}
\ No newline at end of file
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.hpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.hpp
new file mode 100644
index 0000000000..982992a363
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionBaseWorkload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <boost/optional.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases);
+
+} // namespace armnn
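Note that the validate function reads the depth multiplier from dimension 0 of the weights, reflecting the [M, I, H, W] depthwise weight layout ArmNN used at the time. For example, under that assumption:

// Depth multiplier 2 over 16 input channels -> 32 output channels.
armnn::TensorInfo depthwiseWeights({2, 16, 3, 3}, armnn::DataType::Float32);
unsigned int depthMultiplier = depthwiseWeights.GetShape()[0]; // == 2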
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp
new file mode 100644
index 0000000000..742a768b94
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp
@@ -0,0 +1,93 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonDepthwiseConvolutionFloatWorkload.hpp"
+#include <backends/neon/NeonLayerSupport.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+NeonDepthwiseConvolutionFloatWorkload::NeonDepthwiseConvolutionFloatWorkload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+{
+ const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+
+ m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo, descriptor.m_DataLayout);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+ }
+
+ arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+ m_Data.m_Parameters.m_StrideY,
+ m_Data.m_Parameters.m_PadLeft,
+ m_Data.m_Parameters.m_PadRight,
+ m_Data.m_Parameters.m_PadTop,
+ m_Data.m_Parameters.m_PadBottom,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionFloatWorkload", 1, 1);
+
+ arm_compute::ITensor& input = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ bool use3x3Optimisation = weightInfo.GetShape()[3] == 3 && weightInfo.GetShape()[2] == 3;
+ if (use3x3Optimisation)
+ {
+ m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
+ static_cast<arm_compute::NEDepthwiseConvolutionLayer3x3*>(
+ m_pDepthwiseConvolutionLayer.get())->configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+ }
+ else
+ {
+ m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
+ static_cast<arm_compute::NEDepthwiseConvolutionLayer*>(
+ m_pDepthwiseConvolutionLayer.get())->configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+ }
+
+ BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+
+ InitializeArmComputeTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight);
+
+ if (m_BiasTensor)
+ {
+ InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ m_pDepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
+
+void NeonDepthwiseConvolutionFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionFloatWorkload_Execute");
+ BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+
+ m_pDepthwiseConvolutionLayer->run();
+}
+
+void NeonDepthwiseConvolutionFloatWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.hpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.hpp
new file mode 100644
index 0000000000..0109ea10cb
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonDepthwiseConvolutionFloatWorkload : public FloatWorkload<DepthwiseConvolution2dQueueDescriptor>
+{
+public:
+ NeonDepthwiseConvolutionFloatWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
+
+ std::unique_ptr<arm_compute::Tensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
+
+
+
+
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp
new file mode 100644
index 0000000000..722b778eba
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp
@@ -0,0 +1,93 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonDepthwiseConvolutionUint8Workload.hpp"
+#include <backends/neon/NeonLayerSupport.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Uint8Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+{
+ const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+
+ m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo, descriptor.m_DataLayout);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+ }
+
+ arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+ m_Data.m_Parameters.m_StrideY,
+ m_Data.m_Parameters.m_PadLeft,
+ m_Data.m_Parameters.m_PadRight,
+ m_Data.m_Parameters.m_PadTop,
+ m_Data.m_Parameters.m_PadBottom,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionUint8Workload", 1, 1);
+
+ arm_compute::ITensor& input = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ bool use3x3Optimisation = weightInfo.GetShape()[3] == 3 && weightInfo.GetShape()[2] == 3;
+ if (use3x3Optimisation)
+ {
+ m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
+ static_cast<arm_compute::NEDepthwiseConvolutionLayer3x3*>(
+ m_pDepthwiseConvolutionLayer.get())->configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+ }
+ else
+ {
+ m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
+ static_cast<arm_compute::NEDepthwiseConvolutionLayer*>(
+ m_pDepthwiseConvolutionLayer.get())->configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+ }
+
+ BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
+
+ if (m_BiasTensor)
+ {
+ InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor<int32_t>());
+ }
+
+ m_pDepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
+
+void NeonDepthwiseConvolutionUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionUint8Workload_Execute");
+ BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+
+ m_pDepthwiseConvolutionLayer->run();
+}
+
+void NeonDepthwiseConvolutionUint8Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.hpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.hpp
new file mode 100644
index 0000000000..90cf8b0091
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonDepthwiseConvolutionUint8Workload : public Uint8Workload<DepthwiseConvolution2dQueueDescriptor>
+{
+public:
+ NeonDepthwiseConvolutionUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
+
+ std::unique_ptr<arm_compute::Tensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
new file mode 100644
index 0000000000..a08ba8a6ec
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonFloorFloatWorkload.hpp"
+
+namespace armnn
+{
+NeonFloorFloatWorkload::NeonFloorFloatWorkload(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<FloorQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonFloorFloatWorkload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output);
+}
+
+void NeonFloorFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFloorFloatWorkload_Execute");
+ m_Layer.run();
+}
+} //namespace armnn
+
+
+
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
new file mode 100644
index 0000000000..478aa94ca4
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonFloorFloatWorkload : public FloatWorkload<FloorQueueDescriptor>
+{
+public:
+ NeonFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEFloor m_Layer;
+};
+
+} //namespace armnn
+
+
+
+
diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
new file mode 100644
index 0000000000..8cebb4f48f
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
@@ -0,0 +1,110 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonFullyConnectedWorkload.hpp"
+
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights);
+
+ arm_compute::TensorInfo aclBiases;
+ arm_compute::TensorInfo *optionalAclBiases = nullptr;
+ if (descriptor.m_BiasEnabled)
+ {
+ aclBiases = BuildArmComputeTensorInfo(biases);
+ optionalAclBiases = &aclBiases;
+ }
+
+ const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo =
+ ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor);
+
+
+ return arm_compute::NEFullyConnectedLayer::validate(&aclInput,
+ &aclWeights,
+ optionalAclBiases,
+ &aclOutput,
+ fullyConnectedLayerInfo);
+}
+
+NeonFullyConnectedWorkload::NeonFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
+ , m_FullyConnectedLayer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("NeonFullyConnectedWorkload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_WeightsTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo());
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasesTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo());
+ }
+
+ // Configure the fully connected layer.
+ arm_compute::FullyConnectedLayerInfo fc_info;
+ fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix;
+ m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
+
+ // Initialise the weight and bias data, selecting the overload by data type.
+ if (m_Data.m_Weight->GetTensorInfo().GetDataType() == DataType::QuantisedAsymm8)
+ {
+ InitialiseArmComputeTensorData(*m_WeightsTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
+ }
+ else
+ {
+ InitializeArmComputeTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight);
+ }
+
+ if (m_BiasesTensor)
+ {
+ if (m_Data.m_Bias->GetTensorInfo().GetDataType() == DataType::Signed32)
+ {
+ InitialiseArmComputeTensorData(*m_BiasesTensor, m_Data.m_Bias->GetConstTensor<int32_t>());
+ }
+ else
+ {
+ InitializeArmComputeTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias);
+ }
+ }
+
+    // Force Compute Library to perform the necessary copying and reshaping now, after which the
+    // staging tensors that are no longer needed can be freed.
+ m_FullyConnectedLayer.prepare();
+ FreeUnusedTensors();
+}
+
+void NeonFullyConnectedWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFullyConnectedWorkload_Execute");
+ m_FullyConnectedLayer.run();
+}
+
+void NeonFullyConnectedWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_WeightsTensor);
+ FreeTensorIfUnused(m_BiasesTensor);
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp
new file mode 100644
index 0000000000..9ffac96a86
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor);
+
+class NeonFullyConnectedWorkload : public BaseWorkload<FullyConnectedQueueDescriptor>
+{
+public:
+ NeonFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEFullyConnectedLayer m_FullyConnectedLayer;
+
+ std::unique_ptr<arm_compute::Tensor> m_WeightsTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasesTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
new file mode 100644
index 0000000000..dee789af85
--- /dev/null
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonL2NormalizationFloatWorkload.hpp"
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
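+    // L2 normalization is expressed through ACL's generic normalization layer, using an
+    // L2-specific NormalizationLayerInfo.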
+ arm_compute::NormalizationLayerInfo normalizationInfo =
+ CreateAclNormalizationLayerInfoForL2Normalization(input);
+
+ return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo);
+}
+
+NeonL2NormalizationFloatWorkload::NeonL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
+ , m_Layer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("NeonL2NormalizationFloatWorkload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ m_Layer.configure(&input, &output, CreateAclNormalizationLayerInfoForL2Normalization(info.m_InputTensorInfos[0]));
+}
+
+void NeonL2NormalizationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonL2NormalizationFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
new file mode 100644
index 0000000000..c1221fb98c
--- /dev/null
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output);
+
+class NeonL2NormalizationFloatWorkload : public FloatWorkload<L2NormalizationQueueDescriptor>
+{
+public:
+ NeonL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ virtual void Execute() const override;
+
+private:
+    // Purposely not an NEL2Normalize function: L2 normalization is implemented with a
+    // NENormalizationLayer configured for L2 (see the constructor).
+ mutable arm_compute::NENormalizationLayer m_Layer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
new file mode 100644
index 0000000000..8b2b58d9b1
--- /dev/null
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -0,0 +1,22 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonLstmFloatWorkload.hpp"
+
+namespace armnn
+{
+NeonLstmFloatWorkload::NeonLstmFloatWorkload(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<LstmQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonLstmFloatWorkload", 1, 1);
+}
+
+void NeonLstmFloatWorkload::Execute() const
+{
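+    // Lstm has no Neon implementation yet; the workload can be constructed, but executing it always throws.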
+ throw armnn::Exception("No implementation of Lstm in the Neon backend!");
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
new file mode 100644
index 0000000000..4a5394f0a0
--- /dev/null
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonLstmFloatWorkload : public FloatWorkload<LstmQueueDescriptor>
+{
+public:
+ NeonLstmFloatWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonMergerFloatWorkload.cpp b/src/backends/neon/workloads/NeonMergerFloatWorkload.cpp
new file mode 100644
index 0000000000..79039aa51a
--- /dev/null
+++ b/src/backends/neon/workloads/NeonMergerFloatWorkload.cpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonMergerFloatWorkload.hpp"
+
+namespace armnn
+{
+
+void NeonMergerFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerFloatWorkload_Execute");
+ NeonBaseMergerWorkload::Execute();
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonMergerFloatWorkload.hpp b/src/backends/neon/workloads/NeonMergerFloatWorkload.hpp
new file mode 100644
index 0000000000..e7088b8c2f
--- /dev/null
+++ b/src/backends/neon/workloads/NeonMergerFloatWorkload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseMergerWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonMergerFloatWorkload : public NeonBaseMergerWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ using NeonBaseMergerWorkload<DataType::Float16, DataType::Float32>::NeonBaseMergerWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonMergerUint8Workload.cpp b/src/backends/neon/workloads/NeonMergerUint8Workload.cpp
new file mode 100644
index 0000000000..3989702bd3
--- /dev/null
+++ b/src/backends/neon/workloads/NeonMergerUint8Workload.cpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonMergerUint8Workload.hpp"
+
+namespace armnn
+{
+
+void NeonMergerUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerUint8Workload_Execute");
+ NeonBaseMergerWorkload::Execute();
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonMergerUint8Workload.hpp b/src/backends/neon/workloads/NeonMergerUint8Workload.hpp
new file mode 100644
index 0000000000..73c0fd55ad
--- /dev/null
+++ b/src/backends/neon/workloads/NeonMergerUint8Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseMergerWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonMergerUint8Workload : public NeonBaseMergerWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ using NeonBaseMergerWorkload<DataType::QuantisedAsymm8>::NeonBaseMergerWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp
new file mode 100644
index 0000000000..c4241ece19
--- /dev/null
+++ b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp
@@ -0,0 +1,60 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonMultiplicationFloatWorkload.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+    // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied
+    // together with a scale of 1.0 for F32 tensors, even though the rounding policy appears to be ignored
+    // for F32 tensors.
+ return arm_compute::NEPixelWiseMultiplication::validate(&aclInput1,
+ &aclInput2,
+ &aclOutput,
+ 1.0f,
+ arm_compute::ConvertPolicy::SATURATE,
+ arm_compute::RoundingPolicy::TO_ZERO);
+}
+
+NeonMultiplicationFloatWorkload::NeonMultiplicationFloatWorkload(const MultiplicationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonMultiplicationFloatWorkload", 2, 1);
+
+ arm_compute::ITensor& input1 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& input2 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied
+    // together with a scale of 1.0 for F32 tensors, even though the rounding policy appears to be ignored
+    // for F32 tensors.
+ m_PixelWiseMultiplication.configure(&input1,
+ &input2,
+ &output,
+ 1.0f,
+ arm_compute::ConvertPolicy::SATURATE,
+ arm_compute::RoundingPolicy::TO_ZERO);
+}
+
+void NeonMultiplicationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMultiplicationFloatWorkload_Execute");
+ m_PixelWiseMultiplication.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp
new file mode 100644
index 0000000000..0a99c8cedc
--- /dev/null
+++ b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class NeonMultiplicationFloatWorkload : public FloatWorkload<MultiplicationQueueDescriptor>
+{
+public:
+ NeonMultiplicationFloatWorkload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEPixelWiseMultiplication m_PixelWiseMultiplication;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
new file mode 100644
index 0000000000..472c75f222
--- /dev/null
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
@@ -0,0 +1,70 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonNormalizationFloatWorkload.hpp"
+#include <backends/neon/NeonLayerSupport.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ arm_compute::NormalizationLayerInfo normalizationInfo =
+ armcomputetensorutils::BuildArmComputeNormalizationLayerInfo(descriptor);
+
+ return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo);
+}
+
+NeonNormalizationFloatWorkload::NeonNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
+ , m_NormalizationLayer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("NeonNormalizationFloatWorkload", 1, 1);
+ std::string reasonIfUnsupported;
+ if (!IsNeonNormalizationDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters))
+ {
+ throw UnimplementedException(reasonIfUnsupported);
+ }
+
+    // Input and output tensors must have the same shape in all four dimensions.
+    if (info.m_InputTensorInfos[0].GetShape()[0] != info.m_OutputTensorInfos[0].GetShape()[0]
+        || info.m_InputTensorInfos[0].GetShape()[1] != info.m_OutputTensorInfos[0].GetShape()[1]
+        || info.m_InputTensorInfos[0].GetShape()[2] != info.m_OutputTensorInfos[0].GetShape()[2]
+        || info.m_InputTensorInfos[0].GetShape()[3] != info.m_OutputTensorInfos[0].GetShape()[3])
+    {
+        throw InvalidArgumentException("Normalization requires input and output tensors to have the same shape.");
+    }
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ const arm_compute::NormType normType =
+ ConvertNormalizationAlgorithmChannelToAclNormType(m_Data.m_Parameters.m_NormChannelType);
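+    // The trailing 'false' below is assumed to be ACL's is_scaled flag, i.e. alpha is used as given
+    // rather than scaled by the normalization size.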
+ arm_compute::NormalizationLayerInfo normalizationInfo(normType,
+ m_Data.m_Parameters.m_NormSize,
+ m_Data.m_Parameters.m_Alpha,
+ m_Data.m_Parameters.m_Beta,
+ m_Data.m_Parameters.m_K,
+ false);
+
+ m_NormalizationLayer.configure(&input, &output, normalizationInfo);
+}
+
+void NeonNormalizationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNormalizationFloatWorkload_Execute");
+ m_NormalizationLayer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
new file mode 100644
index 0000000000..c6f64c6c15
--- /dev/null
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor);
+
+class NeonNormalizationFloatWorkload : public FloatWorkload<NormalizationQueueDescriptor>
+{
+public:
+ NeonNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NENormalizationLayer m_NormalizationLayer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonPermuteWorkload.cpp b/src/backends/neon/workloads/NeonPermuteWorkload.cpp
new file mode 100644
index 0000000000..0bf4aa1319
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPermuteWorkload.cpp
@@ -0,0 +1,54 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonPermuteWorkload.hpp"
+#include <backends/neon/NeonTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/core/Error.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const PermuteDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+ const armnn::PermutationVector& mappings = descriptor.m_DimMappings;
+
+ return arm_compute::NEPermute::validate(&aclInputInfo, &aclOutputInfo,
+ armcomputetensorutils::BuildArmComputePermutationVector(mappings));
+}
+
+template <armnn::DataType... DataTypes>
+NeonPermuteWorkload<DataTypes...>::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info)
+{
+ using armcomputetensorutils::BuildArmComputePermutationVector;
+
+ m_Data.ValidateInputsOutputs(GetName(), 1, 1);
+
+ const arm_compute::ITensor& input = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
+
+    // Configure the permute function.
+ m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings));
+}
+
+template <armnn::DataType... DataTypes>
+void NeonPermuteWorkload<DataTypes...>::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute");
+ m_PermuteFunction.run();
+}
+
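+// Explicitly instantiate the workload for the data-type sets used by the factory.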
+template class NeonPermuteWorkload<DataType::Float16, DataType::Float32>;
+template class NeonPermuteWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonPermuteWorkload.hpp b/src/backends/neon/workloads/NeonPermuteWorkload.hpp
new file mode 100644
index 0000000000..a85816be38
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPermuteWorkload.hpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/WorkloadData.hpp>
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <armnn/TypesUtils.hpp>
+#include <arm_compute/runtime/NEON/functions/NEPermute.h>
+
+#include <string>
+
+namespace armnn
+{
+arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input, const TensorInfo& output,
+ const PermuteDescriptor& descriptor);
+
+template <armnn::DataType... DataTypes>
+class NeonPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...>
+{
+public:
+ static const std::string& GetName()
+ {
+        static const std::string name = "NeonPermuteWorkload";
+ return name;
+ }
+
+ NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data;
+ mutable arm_compute::NEPermute m_PermuteFunction;
+};
+
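+// Convenience aliases for the two data-type configurations created by the workload factory.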
+using NeonPermuteFloatWorkload = NeonPermuteWorkload<DataType::Float16, DataType::Float32>;
+using NeonPermuteUint8Workload = NeonPermuteWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonPooling2dBaseWorkload.cpp b/src/backends/neon/workloads/NeonPooling2dBaseWorkload.cpp
new file mode 100644
index 0000000000..109e856506
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPooling2dBaseWorkload.cpp
@@ -0,0 +1,47 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonPooling2dBaseWorkload.hpp"
+#include <backends/neon/NeonLayerSupport.hpp>
+#include <backends/neon/NeonTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
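+    // BuildArmComputePoolingLayerInfo translates the armnn pooling descriptor (pool type, kernel
+    // size, stride, padding) into ACL's PoolingLayerInfo.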
+ arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(descriptor);
+
+ return arm_compute::NEPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo);
+}
+
+template <armnn::DataType... dataTypes>
+NeonPooling2dBaseWorkload<dataTypes...>::NeonPooling2dBaseWorkload(
+ const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name)
+ : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs(name, 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters);
+
+ m_PoolingLayer.configure(&input, &output, layerInfo);
+}
+
+template class NeonPooling2dBaseWorkload<DataType::Float16, DataType::Float32>;
+template class NeonPooling2dBaseWorkload<DataType::QuantisedAsymm8>;
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonPooling2dBaseWorkload.hpp b/src/backends/neon/workloads/NeonPooling2dBaseWorkload.hpp
new file mode 100644
index 0000000000..8ea41fe18a
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPooling2dBaseWorkload.hpp
@@ -0,0 +1,37 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor);
+
+// Base class template providing an implementation of the Pooling2d layer common to all data types.
+template <armnn::DataType... dataTypes>
+class NeonPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>
+{
+public:
+ using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data;
+
+ NeonPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ const std::string& name);
+
+protected:
+ mutable arm_compute::NEPoolingLayer m_PoolingLayer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonPooling2dFloatWorkload.cpp b/src/backends/neon/workloads/NeonPooling2dFloatWorkload.cpp
new file mode 100644
index 0000000000..46996b088c
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPooling2dFloatWorkload.cpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonPooling2dFloatWorkload.hpp"
+
+namespace armnn
+{
+
+NeonPooling2dFloatWorkload::NeonPooling2dFloatWorkload(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : NeonPooling2dBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>(descriptor, info,
+ "NeonPooling2dFloatWorkload")
+{
+}
+
+void NeonPooling2dFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dFloatWorkload_Execute");
+ m_PoolingLayer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonPooling2dFloatWorkload.hpp b/src/backends/neon/workloads/NeonPooling2dFloatWorkload.hpp
new file mode 100644
index 0000000000..9b0eebdc2b
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPooling2dFloatWorkload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include "NeonPooling2dBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonPooling2dFloatWorkload : public NeonPooling2dBaseWorkload<armnn::DataType::Float16,
+ armnn::DataType::Float32>
+{
+public:
+ NeonPooling2dFloatWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonPooling2dUint8Workload.cpp b/src/backends/neon/workloads/NeonPooling2dUint8Workload.cpp
new file mode 100644
index 0000000000..8f99a2be86
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPooling2dUint8Workload.cpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonPooling2dUint8Workload.hpp"
+
+namespace armnn
+{
+
+NeonPooling2dUint8Workload::NeonPooling2dUint8Workload(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : NeonPooling2dBaseWorkload<armnn::DataType::QuantisedAsymm8>(descriptor, info, "NeonPooling2dUint8Workload")
+{
+}
+
+void NeonPooling2dUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dUint8Workload_Execute");
+ m_PoolingLayer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonPooling2dUint8Workload.hpp b/src/backends/neon/workloads/NeonPooling2dUint8Workload.hpp
new file mode 100644
index 0000000000..d475c5f721
--- /dev/null
+++ b/src/backends/neon/workloads/NeonPooling2dUint8Workload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/Types.hpp>
+#include "NeonPooling2dBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonPooling2dUint8Workload : public NeonPooling2dBaseWorkload<armnn::DataType::QuantisedAsymm8>
+{
+public:
+ NeonPooling2dUint8Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonReshapeFloatWorkload.cpp b/src/backends/neon/workloads/NeonReshapeFloatWorkload.cpp
new file mode 100644
index 0000000000..2dae9466bb
--- /dev/null
+++ b/src/backends/neon/workloads/NeonReshapeFloatWorkload.cpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonReshapeFloatWorkload.hpp"
+
+namespace armnn
+{
+
+NeonReshapeFloatWorkload::NeonReshapeFloatWorkload(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonReshapeFloatWorkload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output);
+}
+
+void NeonReshapeFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonReshapeFloatWorkload.hpp b/src/backends/neon/workloads/NeonReshapeFloatWorkload.hpp
new file mode 100644
index 0000000000..bdef862419
--- /dev/null
+++ b/src/backends/neon/workloads/NeonReshapeFloatWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonReshapeFloatWorkload : public FloatWorkload<ReshapeQueueDescriptor>
+{
+public:
+ NeonReshapeFloatWorkload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEReshapeLayer m_Layer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonReshapeUint8Workload.cpp b/src/backends/neon/workloads/NeonReshapeUint8Workload.cpp
new file mode 100644
index 0000000000..41aa07fe49
--- /dev/null
+++ b/src/backends/neon/workloads/NeonReshapeUint8Workload.cpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonReshapeUint8Workload.hpp"
+
+namespace armnn
+{
+NeonReshapeUint8Workload::NeonReshapeUint8Workload(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Uint8Workload<ReshapeQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonReshapeUint8Workload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output);
+}
+
+void NeonReshapeUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeUint8Workload_Execute");
+ m_Layer.run();
+}
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonReshapeUint8Workload.hpp b/src/backends/neon/workloads/NeonReshapeUint8Workload.hpp
new file mode 100644
index 0000000000..4951873f0b
--- /dev/null
+++ b/src/backends/neon/workloads/NeonReshapeUint8Workload.hpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonReshapeUint8Workload : public Uint8Workload<ReshapeQueueDescriptor>
+{
+public:
+ NeonReshapeUint8Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEReshapeLayer m_Layer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp
new file mode 100644
index 0000000000..0e11d8249f
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonSoftmaxBaseWorkload.hpp"
+
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor)
+{
+ // NOTE: We report 4D Softmax as unsupported until full support is added to ACL
+    if (input.GetShape().GetNumDimensions() >= 4u)
+ {
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported");
+ }
+
+ const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::NESoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo, descriptor.m_Beta);
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp
new file mode 100644
index 0000000000..446392cd03
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor);
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
new file mode 100644
index 0000000000..92e5139c1a
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonSoftmaxFloatWorkload.hpp"
+
+namespace armnn
+{
+
+NeonSoftmaxFloatWorkload::NeonSoftmaxFloatWorkload(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info)
+ , m_SoftmaxLayer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("NeonSoftmaxFloatWorkload", 1, 1);
+
+ // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions.
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_SoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta);
+}
+
+void NeonSoftmaxFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxFloatWorkload_Execute");
+ m_SoftmaxLayer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp
new file mode 100644
index 0000000000..83f29222eb
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+class NeonSoftmaxFloatWorkload : public FloatWorkload<SoftmaxQueueDescriptor>
+{
+public:
+ NeonSoftmaxFloatWorkload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NESoftmaxLayer m_SoftmaxLayer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
new file mode 100644
index 0000000000..cff869c9b7
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonSoftmaxUint8Workload.hpp"
+
+namespace armnn
+{
+
+NeonSoftmaxUint8Workload::NeonSoftmaxUint8Workload(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : Uint8Workload<SoftmaxQueueDescriptor>(descriptor, info)
+ , m_SoftmaxLayer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("NeonSoftmaxUint8Workload", 1, 1);
+
+ arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
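+    // Softmax outputs lie in [0, 1]; with QAsymm8 data a scale of 1/256 and an offset of 0 represent
+    // that range exactly, which is the only quantization the check below accepts.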
+ const auto outputQuantization = output.info()->quantization_info();
+
+ if ((outputQuantization.scale != (1.0f / 256.0f)) || (outputQuantization.offset != 0))
+ {
+ throw InvalidArgumentException(
+ "Invalid quantization for output. Only scale = 1.0f / 256.0f and offset = 0 supported");
+ }
+
+ m_SoftmaxLayer.configure(&input, &output, descriptor.m_Parameters.m_Beta);
+}
+
+void NeonSoftmaxUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxUint8Workload_Execute");
+
+ m_SoftmaxLayer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp
new file mode 100644
index 0000000000..0d72514ec0
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+namespace armnn
+{
+
+class NeonSoftmaxUint8Workload : public Uint8Workload<SoftmaxQueueDescriptor>
+{
+public:
+ NeonSoftmaxUint8Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NESoftmaxLayer m_SoftmaxLayer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonSplitterFloatWorkload.cpp b/src/backends/neon/workloads/NeonSplitterFloatWorkload.cpp
new file mode 100644
index 0000000000..39ed5b7cbc
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSplitterFloatWorkload.cpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonSplitterFloatWorkload.hpp"
+
+namespace armnn
+{
+
+void NeonSplitterFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterFloatWorkload_Execute");
+ NeonBaseSplitterWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSplitterFloatWorkload.hpp b/src/backends/neon/workloads/NeonSplitterFloatWorkload.hpp
new file mode 100644
index 0000000000..744a4fe216
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSplitterFloatWorkload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseSplitterWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonSplitterFloatWorkload : public NeonBaseSplitterWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ using NeonBaseSplitterWorkload<DataType::Float16, DataType::Float32>::NeonBaseSplitterWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSplitterUint8Workload.cpp b/src/backends/neon/workloads/NeonSplitterUint8Workload.cpp
new file mode 100644
index 0000000000..4b2cf8fc91
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSplitterUint8Workload.cpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonSplitterUint8Workload.hpp"
+
+namespace armnn
+{
+
+void NeonSplitterUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterUint8Workload_Execute");
+ NeonBaseSplitterWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSplitterUint8Workload.hpp b/src/backends/neon/workloads/NeonSplitterUint8Workload.hpp
new file mode 100644
index 0000000000..f219cfaa7d
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSplitterUint8Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseSplitterWorkload.hpp"
+
+namespace armnn
+{
+
+class NeonSplitterUint8Workload : public NeonBaseSplitterWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ using NeonBaseSplitterWorkload<DataType::QuantisedAsymm8>::NeonBaseSplitterWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp
new file mode 100644
index 0000000000..2acb829e3d
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp
@@ -0,0 +1,46 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonSubtractionFloatWorkload.hpp"
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonSubtractionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::NEArithmeticSubtraction::validate(&aclInput0,
+ &aclInput1,
+ &aclOutput,
+ arm_compute::ConvertPolicy::SATURATE);
+}
+
+NeonSubtractionFloatWorkload::NeonSubtractionFloatWorkload(const SubtractionQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<SubtractionQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonSubtractionFloatWorkload", 2, 1);
+
+ arm_compute::ITensor& input1 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ITensor& input2 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
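+    // ConvertPolicy::SATURATE clamps out-of-range results to the representable range instead of wrapping.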
+ m_SubLayer.configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE);
+}
+
+void NeonSubtractionFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSubtractionFloatWorkload_Execute");
+ m_SubLayer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp
new file mode 100644
index 0000000000..98aeb4cfc5
--- /dev/null
+++ b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status NeonSubtractionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class NeonSubtractionFloatWorkload : public FloatWorkload<SubtractionQueueDescriptor>
+{
+public:
+ NeonSubtractionFloatWorkload(const SubtractionQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::NEArithmeticSubtraction m_SubLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloadUtils.cpp b/src/backends/neon/workloads/NeonWorkloadUtils.cpp
new file mode 100644
index 0000000000..195f090171
--- /dev/null
+++ b/src/backends/neon/workloads/NeonWorkloadUtils.cpp
@@ -0,0 +1,60 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "NeonWorkloadUtils.hpp"
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/neon/NeonTensorHandle.hpp>
+#include <backends/neon/NeonLayerSupport.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include <armnn/Utils.hpp>
+#include <armnn/Exceptions.hpp>
+
+#include <cstring>
+#include <boost/assert.hpp>
+#include <boost/cast.hpp>
+#include <boost/format.hpp>
+
+#include "Profiling.hpp"
+
+#include <armnn/Types.hpp>
+#include <Half.hpp>
+
+using namespace armnn::armcomputetensorutils;
+
+namespace armnn
+{
+
+// Allocates a tensor and copies the contents of 'data' into it.
+template<typename T>
+void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data)
+{
+ InitialiseArmComputeTensorEmpty(tensor);
+ CopyArmComputeITensorData(data, tensor);
+}
+
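+// Explicit instantiations for every element type the Neon workloads need to copy into ACL tensors.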
+template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const Half* data);
+template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const float* data);
+template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const uint8_t* data);
+template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const int32_t* data);
+
+void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor,
+ const ConstCpuTensorHandle* handle)
+{
+ BOOST_ASSERT(handle);
+ switch(handle->GetTensorInfo().GetDataType())
+ {
+ case DataType::Float16:
+ InitialiseArmComputeTensorData(tensor, handle->GetConstTensor<Half>());
+ break;
+ case DataType::Float32:
+ InitialiseArmComputeTensorData(tensor, handle->GetConstTensor<float>());
+ break;
+ default:
+ BOOST_ASSERT_MSG(false, "Unexpected floating point type.");
+ }
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloadUtils.hpp b/src/backends/neon/workloads/NeonWorkloadUtils.hpp
new file mode 100644
index 0000000000..22668f6f4b
--- /dev/null
+++ b/src/backends/neon/workloads/NeonWorkloadUtils.hpp
@@ -0,0 +1,34 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/neon/NeonTensorHandle.hpp>
+
+#include "NeonTimer.hpp"
+
+#include <arm_compute/core/Types.h>
+#include <arm_compute/core/Helpers.h>
+#include <arm_compute/runtime/NEON/NEFunctions.h>
+#include <arm_compute/runtime/SubTensor.h>
+
+#include <boost/cast.hpp>
+
+namespace armnn
+{
+class Layer;
+
+template<typename T>
+void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data);
+
+void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor, const ConstCpuTensorHandle* handle);
+} //namespace armnn
+
+
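+// Times the enclosing scope with both the Neon kernel timer and a wall-clock timer, reporting under CpuAcc.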
+#define ARMNN_SCOPED_PROFILING_EVENT_NEON(name) \
+ ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::CpuAcc, \
+ name, \
+ armnn::NeonTimer(), \
+ armnn::WallClockTimer())
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
new file mode 100644
index 0000000000..a4ab6b2cac
--- /dev/null
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
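+// Umbrella header: pulls in every Neon workload so consumers need only a single include.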
+#include "NeonActivationFloatWorkload.hpp"
+#include "NeonActivationUint8Workload.hpp"
+#include "NeonAdditionFloatWorkload.hpp"
+#include "NeonBaseConstantWorkload.hpp"
+#include "NeonBaseMergerWorkload.hpp"
+#include "NeonBaseSplitterWorkload.hpp"
+#include "NeonBatchNormalizationFloatWorkload.hpp"
+#include "NeonConstantFloatWorkload.hpp"
+#include "NeonConstantUint8Workload.hpp"
+#include "NeonConvertFp16ToFp32Workload.hpp"
+#include "NeonConvertFp32ToFp16Workload.hpp"
+#include "NeonConvolution2dBaseWorkload.hpp"
+#include "NeonConvolution2dFloatWorkload.hpp"
+#include "NeonConvolution2dUint8Workload.hpp"
+#include "NeonDepthwiseConvolutionFloatWorkload.hpp"
+#include "NeonDepthwiseConvolutionUint8Workload.hpp"
+#include "NeonFloorFloatWorkload.hpp"
+#include "NeonFullyConnectedWorkload.hpp"
+#include "NeonL2NormalizationFloatWorkload.hpp"
+#include "NeonLstmFloatWorkload.hpp"
+#include "NeonMergerFloatWorkload.hpp"
+#include "NeonMergerUint8Workload.hpp"
+#include "NeonMultiplicationFloatWorkload.hpp"
+#include "NeonNormalizationFloatWorkload.hpp"
+#include "NeonPermuteWorkload.hpp"
+#include "NeonPooling2dBaseWorkload.hpp"
+#include "NeonPooling2dFloatWorkload.hpp"
+#include "NeonPooling2dUint8Workload.hpp"
+#include "NeonReshapeFloatWorkload.hpp"
+#include "NeonReshapeUint8Workload.hpp"
+#include "NeonSoftmaxFloatWorkload.hpp"
+#include "NeonSoftmaxUint8Workload.hpp"
+#include "NeonSplitterFloatWorkload.hpp"
+#include "NeonSplitterUint8Workload.hpp"
+#include "NeonSubtractionFloatWorkload.hpp"