aboutsummaryrefslogtreecommitdiff
path: root/src/armnn/backends
diff options
context:
space:
mode:
Diffstat (limited to 'src/armnn/backends')
-rw-r--r--src/armnn/backends/AclBaseMemoryManager.cpp32
-rw-r--r--src/armnn/backends/AclBaseMemoryManager.hpp46
-rw-r--r--src/armnn/backends/ArmComputeTensorUtils.cpp29
-rw-r--r--src/armnn/backends/ArmComputeTensorUtils.hpp97
-rw-r--r--src/armnn/backends/ArmComputeUtils.hpp12
-rw-r--r--src/armnn/backends/ClContextControl.cpp61
-rw-r--r--src/armnn/backends/ClContextControl.hpp14
-rw-r--r--src/armnn/backends/ClLayerSupport.cpp222
-rw-r--r--src/armnn/backends/ClLayerSupport.hpp39
-rw-r--r--src/armnn/backends/ClTensorHandle.hpp84
-rw-r--r--src/armnn/backends/ClWorkloadFactory.cpp110
-rw-r--r--src/armnn/backends/ClWorkloadFactory.hpp29
-rw-r--r--src/armnn/backends/ClWorkloadUtils.hpp30
-rw-r--r--src/armnn/backends/ClWorkloads.hpp6
-rw-r--r--src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp25
-rw-r--r--src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp7
-rw-r--r--src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp14
-rw-r--r--src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp71
-rw-r--r--src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp29
-rw-r--r--src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp41
-rw-r--r--src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp18
-rw-r--r--src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp18
-rw-r--r--src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp20
-rw-r--r--src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp20
-rw-r--r--src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp6
-rw-r--r--src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp10
-rw-r--r--src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp10
-rw-r--r--src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp74
-rw-r--r--src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp22
-rw-r--r--src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp64
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp28
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp64
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp28
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp36
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp10
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp33
-rw-r--r--src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp8
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp122
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp37
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp22
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp17
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp91
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp22
-rw-r--r--src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp16
-rw-r--r--src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp70
-rw-r--r--src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp19
-rw-r--r--src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp16
-rw-r--r--src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp5
-rw-r--r--src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp405
-rw-r--r--src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp67
-rw-r--r--src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp26
-rw-r--r--src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp9
-rw-r--r--src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp16
-rw-r--r--src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp13
-rw-r--r--src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp10
-rw-r--r--src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp8
-rw-r--r--src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp28
-rw-r--r--src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp16
-rw-r--r--src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp2
-rw-r--r--src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp4
-rw-r--r--src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp2
-rw-r--r--src/armnn/backends/CpuTensorHandle.cpp6
-rw-r--r--src/armnn/backends/CpuTensorHandle.hpp41
-rw-r--r--src/armnn/backends/ITensorHandle.hpp48
-rw-r--r--src/armnn/backends/MakeWorkloadHelper.hpp19
-rw-r--r--src/armnn/backends/MemCopyWorkload.cpp223
-rw-r--r--src/armnn/backends/MemCopyWorkload.hpp120
-rw-r--r--src/armnn/backends/NeonLayerSupport.cpp242
-rw-r--r--src/armnn/backends/NeonLayerSupport.hpp39
-rw-r--r--src/armnn/backends/NeonTensorHandle.hpp73
-rw-r--r--src/armnn/backends/NeonWorkloadFactory.cpp110
-rw-r--r--src/armnn/backends/NeonWorkloadFactory.hpp29
-rw-r--r--src/armnn/backends/NeonWorkloadUtils.cpp21
-rw-r--r--src/armnn/backends/NeonWorkloadUtils.hpp9
-rw-r--r--src/armnn/backends/NeonWorkloads.hpp3
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp27
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp7
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp13
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp20
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp7
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp25
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp11
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp11
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp75
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp20
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp4
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp41
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp26
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp43
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp26
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp69
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp13
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp7
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp8
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp46
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp19
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp41
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp8
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp39
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp6
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp67
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp15
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp16
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp5
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp22
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp20
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp4
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp23
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp5
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp23
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp6
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp16
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp13
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp8
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp8
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp5
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp3
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp30
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp17
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp6
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp2
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp4
-rw-r--r--src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp2
-rw-r--r--src/armnn/backends/OutputHandler.cpp8
-rw-r--r--src/armnn/backends/OutputHandler.hpp21
-rw-r--r--src/armnn/backends/RefLayerSupport.cpp99
-rw-r--r--src/armnn/backends/RefLayerSupport.hpp38
-rw-r--r--src/armnn/backends/RefWorkloadFactory.cpp61
-rw-r--r--src/armnn/backends/RefWorkloadFactory.hpp22
-rw-r--r--src/armnn/backends/RefWorkloads.hpp3
-rw-r--r--src/armnn/backends/RefWorkloads/Activation.cpp2
-rw-r--r--src/armnn/backends/RefWorkloads/Activation.hpp2
-rw-r--r--src/armnn/backends/RefWorkloads/Broadcast.hpp2
-rw-r--r--src/armnn/backends/RefWorkloads/ConvImpl.cpp2
-rw-r--r--src/armnn/backends/RefWorkloads/ConvImpl.hpp26
-rw-r--r--src/armnn/backends/RefWorkloads/FullyConnected.cpp6
-rw-r--r--src/armnn/backends/RefWorkloads/FullyConnected.hpp2
-rw-r--r--src/armnn/backends/RefWorkloads/Merger.hpp14
-rw-r--r--src/armnn/backends/RefWorkloads/Pooling2d.cpp8
-rw-r--r--src/armnn/backends/RefWorkloads/Pooling2d.hpp2
-rw-r--r--src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp2
-rw-r--r--src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp15
-rw-r--r--src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp9
-rw-r--r--src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp23
-rw-r--r--src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp9
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp25
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp21
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp29
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp21
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp13
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp8
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp15
-rw-r--r--src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp9
-rw-r--r--src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp13
-rw-r--r--src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp8
-rw-r--r--src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp16
-rw-r--r--src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp7
-rw-r--r--src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp10
-rw-r--r--src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp7
-rw-r--r--src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp16
-rw-r--r--src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp7
-rw-r--r--src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp16
-rw-r--r--src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp21
-rw-r--r--src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp4
-rw-r--r--src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp1
-rw-r--r--src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp13
-rw-r--r--src/armnn/backends/RefWorkloads/ResizeBilinear.cpp22
-rw-r--r--src/armnn/backends/RefWorkloads/Softmax.cpp8
-rw-r--r--src/armnn/backends/RefWorkloads/Softmax.hpp2
-rw-r--r--src/armnn/backends/RefWorkloads/Splitter.hpp8
-rw-r--r--src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp2
-rw-r--r--src/armnn/backends/Workload.hpp81
-rw-r--r--src/armnn/backends/WorkloadData.cpp69
-rw-r--r--src/armnn/backends/WorkloadData.hpp96
-rw-r--r--src/armnn/backends/WorkloadFactory.cpp418
-rw-r--r--src/armnn/backends/WorkloadFactory.hpp23
-rw-r--r--src/armnn/backends/WorkloadUtils.hpp139
-rw-r--r--src/armnn/backends/test/ActivationFixture.hpp2
-rw-r--r--src/armnn/backends/test/ActivationTestImpl.hpp27
-rw-r--r--src/armnn/backends/test/ArmComputeCl.cpp48
-rw-r--r--src/armnn/backends/test/ArmComputeNeon.cpp156
-rw-r--r--src/armnn/backends/test/BatchNormTestImpl.hpp6
-rw-r--r--src/armnn/backends/test/ClContextControlFixture.hpp21
-rw-r--r--src/armnn/backends/test/Conv2dTestImpl.hpp52
-rw-r--r--src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp55
-rw-r--r--src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp55
-rw-r--r--src/armnn/backends/test/CreateWorkloadCl.cpp340
-rw-r--r--src/armnn/backends/test/CreateWorkloadNeon.cpp270
-rw-r--r--src/armnn/backends/test/CreateWorkloadRef.cpp219
-rw-r--r--src/armnn/backends/test/FullyConnectedTestImpl.hpp8
-rw-r--r--src/armnn/backends/test/IsLayerSupportedTest.cpp178
-rw-r--r--src/armnn/backends/test/IsLayerSupportedTestImpl.hpp167
-rw-r--r--src/armnn/backends/test/LayerReleaseConstantDataTest.cpp212
-rw-r--r--src/armnn/backends/test/LayerTests.cpp166
-rw-r--r--src/armnn/backends/test/LayerTests.hpp25
-rw-r--r--src/armnn/backends/test/LstmTestImpl.hpp1150
-rw-r--r--src/armnn/backends/test/MemCopyTests.cpp24
-rw-r--r--src/armnn/backends/test/NormTestImpl.hpp4
-rw-r--r--src/armnn/backends/test/Pooling2dTestImpl.hpp14
-rw-r--r--src/armnn/backends/test/QuantizeHelper.hpp2
-rw-r--r--src/armnn/backends/test/Reference.cpp26
-rw-r--r--src/armnn/backends/test/SoftmaxTestImpl.hpp2
-rw-r--r--src/armnn/backends/test/SplitterTestImpl.hpp40
-rw-r--r--src/armnn/backends/test/TensorCopyUtils.cpp11
-rw-r--r--src/armnn/backends/test/WorkloadDataValidation.cpp71
240 files changed, 7311 insertions, 2079 deletions
diff --git a/src/armnn/backends/AclBaseMemoryManager.cpp b/src/armnn/backends/AclBaseMemoryManager.cpp
deleted file mode 100644
index fc796995c7..0000000000
--- a/src/armnn/backends/AclBaseMemoryManager.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// See LICENSE file in the project root for full license information.
-//
-#include "AclBaseMemoryManager.hpp"
-
-namespace armnn
-{
-
-#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED
-AclBaseMemoryManager::AclBaseMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc)
-{
- // (re)create the memory manager components
- m_Allocator = std::move(alloc);
- m_IntraLayerLifetimeMgr = std::make_shared<arm_compute::BlobLifetimeManager>();
- m_IntraLayerPoolMgr = std::make_shared<arm_compute::PoolManager>();
- m_IntraLayerMemoryMgr = std::make_shared<arm_compute::MemoryManagerOnDemand>(m_IntraLayerLifetimeMgr,
- m_IntraLayerPoolMgr);
-}
-
-void AclBaseMemoryManager::Finalize()
-{
- // Set allocator that the memory manager will use
- m_IntraLayerMemoryMgr->set_allocator(m_Allocator.get());
- // Number of pools that the manager will create. This specifies how many layers you want to run in parallel
- m_IntraLayerMemoryMgr->set_num_pools(1);
- // Finalize the memory manager. (Validity checks, memory allocations, etc)
- m_IntraLayerMemoryMgr->finalize();
-}
-#endif
-
-}
diff --git a/src/armnn/backends/AclBaseMemoryManager.hpp b/src/armnn/backends/AclBaseMemoryManager.hpp
deleted file mode 100644
index 74b596fe97..0000000000
--- a/src/armnn/backends/AclBaseMemoryManager.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// See LICENSE file in the project root for full license information.
-//
-#pragma once
-
-#include "WorkloadFactory.hpp"
-
-#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED
-#include "arm_compute/runtime/IAllocator.h"
-#include "arm_compute/runtime/BlobLifetimeManager.h"
-#include "arm_compute/runtime/MemoryManagerOnDemand.h"
-#include "arm_compute/runtime/PoolManager.h"
-
-#include <memory>
-#endif
-
-namespace armnn
-{
-
-// ARM Compute Base Memory Manager
-class AclBaseMemoryManager
-{
-public:
-
- AclBaseMemoryManager() { }
- virtual ~AclBaseMemoryManager() { }
-
-#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED
- AclBaseMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc);
-
- void Finalize();
-
- std::shared_ptr<arm_compute::MemoryManagerOnDemand>& Get() { return m_IntraLayerMemoryMgr; }
-
-protected:
-
- mutable std::unique_ptr<arm_compute::IAllocator> m_Allocator;
- mutable std::shared_ptr<arm_compute::BlobLifetimeManager> m_IntraLayerLifetimeMgr;
- mutable std::shared_ptr<arm_compute::PoolManager> m_IntraLayerPoolMgr;
- mutable std::shared_ptr<arm_compute::MemoryManagerOnDemand> m_IntraLayerMemoryMgr;
-#endif
-
-};
-
-} //namespace armnn
diff --git a/src/armnn/backends/ArmComputeTensorUtils.cpp b/src/armnn/backends/ArmComputeTensorUtils.cpp
index f88ed2b4c3..8e4abaf67a 100644
--- a/src/armnn/backends/ArmComputeTensorUtils.cpp
+++ b/src/armnn/backends/ArmComputeTensorUtils.cpp
@@ -16,23 +16,17 @@ arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType)
{
switch(dataType)
{
+ case armnn::DataType::Float16:
+ return arm_compute::DataType::F16;
case armnn::DataType::Float32:
- {
return arm_compute::DataType::F32;
- }
case armnn::DataType::QuantisedAsymm8:
- {
return arm_compute::DataType::QASYMM8;
- }
case armnn::DataType::Signed32:
- {
return arm_compute::DataType::S32;
- }
default:
- {
BOOST_ASSERT_MSG(false, "Unknown data type");
return arm_compute::DataType::UNKNOWN;
- }
}
}
@@ -40,15 +34,15 @@ arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& te
{
arm_compute::TensorShape shape;
- // armnn tensors are (batch, channels, height, width)
- // arm_compute tensors are (width, height, channels, batch)
+ // armnn tensors are (batch, channels, height, width).
+ // arm_compute tensors are (width, height, channels, batch).
for (unsigned int i = 0; i < tensorShape.GetNumDimensions(); i++)
{
- // note that our dimensions are stored in the opposite order to ACL's
+ // Note that our dimensions are stored in the opposite order to ACL's.
shape.set(tensorShape.GetNumDimensions() - i - 1, tensorShape[i]);
// TensorShape::set() flattens leading ones, so that batch size 1 cannot happen.
- // arm_compute tensors expect this
+ // arm_compute tensors expect this.
}
// prevent arm_compute issue where tensor is flattened to nothing
@@ -80,11 +74,18 @@ arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDes
using arm_compute::PoolingLayerInfo;
using arm_compute::Size2D;
- // Resolve ARM Compute layer parameters
+ // Resolve ARM Compute layer parameters.
const PoolingType poolingType = ConvertPoolingAlgorithmToAclPoolingType(descriptor.m_PoolType);
+
+ bool isGlobalPooling = (descriptor.m_StrideX==0 && descriptor.m_StrideY==0);
+ // Use the specific constructor if global pooling.
+ if(isGlobalPooling)
+ {
+ return arm_compute::PoolingLayerInfo(poolingType);
+ }
+
const DimensionRoundingType rounding = ConvertOutputShapeRoundingToAclDimensionRoundingType(
descriptor.m_OutputShapeRounding);
-
const PadStrideInfo padStrideInfo(descriptor.m_StrideX,
descriptor.m_StrideY,
descriptor.m_PadLeft,
diff --git a/src/armnn/backends/ArmComputeTensorUtils.hpp b/src/armnn/backends/ArmComputeTensorUtils.hpp
index 84547f9c80..81c6620a01 100644
--- a/src/armnn/backends/ArmComputeTensorUtils.hpp
+++ b/src/armnn/backends/ArmComputeTensorUtils.hpp
@@ -20,26 +20,26 @@ class ITensorHandle;
namespace armcomputetensorutils
{
-/// Utility function to map an armnn::DataType to corresponding arm_compute::DataType
+/// Utility function to map an armnn::DataType to corresponding arm_compute::DataType.
arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType);
-/// Utility function used to setup an arm_compute::TensorShape object from an armnn::TensorShape
+/// Utility function used to setup an arm_compute::TensorShape object from an armnn::TensorShape.
arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& tensorShape);
/// Utility function used to setup an arm_compute::ITensorInfo object whose dimensions are based on the given
-/// armnn::ITensorInfo
+/// armnn::ITensorInfo.
arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo);
-/// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor
+/// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor.
arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor);
-/// Utility function to setup an arm_compute::NormalizationLayerInfo object from an armnn::NormalizationDescriptor
+/// Utility function to setup an arm_compute::NormalizationLayerInfo object from an armnn::NormalizationDescriptor.
arm_compute::NormalizationLayerInfo BuildArmComputeNormalizationLayerInfo(const NormalizationDescriptor& desc);
-/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector
+/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector.
arm_compute::PermutationVector BuildArmComputePermutationVector(const armnn::PermutationVector& vector);
-/// Utility function used to setup an arm_compute::PadStrideInfo object from an armnn layer descriptor
+/// Utility function used to setup an arm_compute::PadStrideInfo object from an armnn layer descriptor.
template <typename Descriptor>
arm_compute::PadStrideInfo BuildArmComputePadStrideInfo(const Descriptor &descriptor)
{
@@ -65,6 +65,16 @@ void InitialiseArmComputeTensorEmpty(Tensor& tensor)
tensor.allocator()->allocate();
}
+/// Utility function to free unused tensors after a workload is configured and prepared.
+/// Releases the object owned by \p tensor when the pointer is non-null and its is_used() returns false;
+/// in every other case the pointer is left untouched.
+/// \tparam Tensor Any type that provides an is_used() member
+/// \param tensor Owning pointer that is reset to null when the tensor it holds is unused
+template <typename Tensor>
+void FreeTensorIfUnused(std::unique_ptr<Tensor>& tensor)
+{
+ // A null pointer is skipped, so calling this again on an already-freed tensor does nothing.
+ if (tensor && !tensor->is_used())
+ {
+ tensor.reset(nullptr);
+ }
+}
+
// Helper function to obtain byte offset into tensor data
inline size_t GetTensorOffset(const arm_compute::ITensorInfo& info,
uint32_t batchIndex,
@@ -73,14 +83,14 @@ inline size_t GetTensorOffset(const arm_compute::ITensorInfo& info,
uint32_t x)
{
arm_compute::Coordinates coords;
- coords.set(3, boost::numeric_cast<int>(batchIndex));
- coords.set(2, boost::numeric_cast<int>(channelIndex));
- coords.set(1, boost::numeric_cast<int>(y));
- coords.set(0, boost::numeric_cast<int>(x));
+ coords.set(3, static_cast<int>(batchIndex));
+ coords.set(2, static_cast<int>(channelIndex));
+ coords.set(1, static_cast<int>(y));
+ coords.set(0, static_cast<int>(x));
return info.offset_element_in_bytes(coords);
}
-// Helper function to obtain element offset into data buffer representing tensor data (assuming no strides)
+// Helper function to obtain element offset into data buffer representing tensor data (assuming no strides).
inline size_t GetLinearBufferOffset(const arm_compute::ITensorInfo& info,
uint32_t batchIndex,
uint32_t channelIndex,
@@ -88,25 +98,25 @@ inline size_t GetLinearBufferOffset(const arm_compute::ITensorInfo& info,
uint32_t x)
{
const arm_compute::TensorShape& shape = info.tensor_shape();
- uint32_t width = boost::numeric_cast<uint32_t>(shape[0]);
- uint32_t height = boost::numeric_cast<uint32_t>(shape[1]);
- uint32_t numChannels = boost::numeric_cast<uint32_t>(shape[2]);
+ uint32_t width = static_cast<uint32_t>(shape[0]);
+ uint32_t height = static_cast<uint32_t>(shape[1]);
+ uint32_t numChannels = static_cast<uint32_t>(shape[2]);
return ((batchIndex * numChannels + channelIndex) * height + y) * width + x;
}
template <typename T>
void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData)
{
- // if MaxNumOfTensorDimensions is increased, this loop will need fixing
+ // If MaxNumOfTensorDimensions is increased, this loop will need fixing.
static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyArmComputeITensorData");
{
const arm_compute::ITensorInfo& info = *srcTensor.info();
const arm_compute::TensorShape& shape = info.tensor_shape();
const uint8_t* const bufferPtr = srcTensor.buffer();
- uint32_t width = boost::numeric_cast<uint32_t>(shape[0]);
- uint32_t height = boost::numeric_cast<uint32_t>(shape[1]);
- uint32_t numChannels = boost::numeric_cast<uint32_t>(shape[2]);
- uint32_t numBatches = boost::numeric_cast<uint32_t>(shape[3]);
+ uint32_t width = static_cast<uint32_t>(shape[0]);
+ uint32_t height = static_cast<uint32_t>(shape[1]);
+ uint32_t numChannels = static_cast<uint32_t>(shape[2]);
+ uint32_t numBatches = static_cast<uint32_t>(shape[3]);
for (unsigned int batchIndex = 0; batchIndex < numBatches; ++batchIndex)
{
@@ -114,8 +124,8 @@ void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData
{
for (unsigned int y = 0; y < height; ++y)
{
- // Copy one row from arm_compute tensor buffer to linear memory buffer
- // A row is the largest contiguous region we can copy, as the tensor data may be using strides
+ // Copies one row from arm_compute tensor buffer to linear memory buffer.
+ // A row is the largest contiguous region we can copy, as the tensor data may be using strides.
memcpy(dstData + GetLinearBufferOffset(info, batchIndex, channelIndex, y, 0),
bufferPtr + GetTensorOffset(info, batchIndex, channelIndex, y, 0),
width * sizeof(T));
@@ -128,16 +138,16 @@ void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData
template <typename T>
void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor)
{
- // if MaxNumOfTensorDimensions is increased, this loop will need fixing
+ // If MaxNumOfTensorDimensions is increased, this loop will need fixing.
static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyArmComputeITensorData");
{
const arm_compute::ITensorInfo& info = *dstTensor.info();
const arm_compute::TensorShape& shape = info.tensor_shape();
uint8_t* const bufferPtr = dstTensor.buffer();
- uint32_t width = boost::numeric_cast<uint32_t>(shape[0]);
- uint32_t height = boost::numeric_cast<uint32_t>(shape[1]);
- uint32_t numChannels = boost::numeric_cast<uint32_t>(shape[2]);
- uint32_t numBatches = boost::numeric_cast<uint32_t>(shape[3]);
+ uint32_t width = static_cast<uint32_t>(shape[0]);
+ uint32_t height = static_cast<uint32_t>(shape[1]);
+ uint32_t numChannels = static_cast<uint32_t>(shape[2]);
+ uint32_t numBatches = static_cast<uint32_t>(shape[3]);
for (unsigned int batchIndex = 0; batchIndex < numBatches; ++batchIndex)
{
@@ -145,8 +155,8 @@ void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor
{
for (unsigned int y = 0; y < height; ++y)
{
- // Copy one row from linear memory buffer to arm_compute tensor buffer
- // A row is the largest contiguous region we can copy, as the tensor data may be using strides
+ // Copies one row from linear memory buffer to arm_compute tensor buffer.
+ // A row is the largest contiguous region we can copy, as the tensor data may be using strides.
memcpy(bufferPtr + GetTensorOffset(info, batchIndex, channelIndex, y, 0),
srcData + GetLinearBufferOffset(info, batchIndex, channelIndex, y, 0),
width * sizeof(T));
@@ -156,5 +166,34 @@ void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor
}
}
+/// Construct a TensorShape object from an ArmCompute object based on arm_compute::Dimensions.
+/// The dimension order is reversed: element i of \p shapelike is stored at index (num_dimensions() - 1) - i.
+/// \tparam ArmComputeType Any type that implements the Dimensions interface
+/// \tparam T Shape value type
+/// \param shapelike An ArmCompute object that implements the Dimensions interface
+/// \param initial A default value to initialise the shape with
+/// \return A TensorShape object filled from the Acl shapelike object.
+template<typename ArmComputeType, typename T>
+TensorShape GetTensorShape(const ArmComputeType& shapelike, T initial)
+{
+ // Scratch buffer with MaxNumOfTensorDimensions entries, each starting as 'initial'.
+ // NOTE(review): if shapelike.num_dimensions() can exceed MaxNumOfTensorDimensions, the indexed write
+ // below goes past the end of this buffer - confirm that callers guarantee it cannot.
+ std::vector<unsigned int> s(MaxNumOfTensorDimensions, initial);
+ for (unsigned int i=0; i < shapelike.num_dimensions(); ++i)
+ {
+ // Fills from the back towards the front, so the first num_dimensions() slots are all overwritten
+ // and only the slots after them keep 'initial'.
+ s[(shapelike.num_dimensions()-1)-i] = boost::numeric_cast<unsigned int>(shapelike[i]);
+ }
+ return TensorShape(boost::numeric_cast<unsigned int>(shapelike.num_dimensions()), s.data());
+// NOTE(review): the ';' after the closing brace below is an empty declaration and could be dropped.
+};
+
+/// Get the strides from an ACL strides object.
+/// Forwards to GetTensorShape with 0 as the initial fill value.
+/// \param strides The ACL strides object to read from
+/// \return A TensorShape built by GetTensorShape from the entries of \p strides.
+inline TensorShape GetStrides(const arm_compute::Strides& strides)
+{
+ return GetTensorShape(strides, 0U);
+}
+
+/// Get the shape from an ACL shape object.
+/// Forwards to GetTensorShape with 1 as the initial fill value.
+/// \param shape The ACL shape object to read from
+/// \return A TensorShape built by GetTensorShape from the entries of \p shape.
+inline TensorShape GetShape(const arm_compute::TensorShape& shape)
+{
+ return GetTensorShape(shape, 1U);
+}
+
} // namespace armcomputetensorutils
} // namespace armnn
diff --git a/src/armnn/backends/ArmComputeUtils.hpp b/src/armnn/backends/ArmComputeUtils.hpp
index c451e6434b..3c57fb59b7 100644
--- a/src/armnn/backends/ArmComputeUtils.hpp
+++ b/src/armnn/backends/ArmComputeUtils.hpp
@@ -36,7 +36,7 @@ CreateAclNormalizationLayerInfoForL2Normalization(const armnn::TensorInfo& tenso
// For the reference implementation, to make alpha_ become 1, we'd have to use alpha = normSize instead.
const float alpha = 1.0f;
- // Don't offset the reduction
+ // Don't offset the reduction.
const float kappa = 0.0f;
// pow(reduction, -0.5) = 1 / sqrt(reduction)
@@ -53,7 +53,7 @@ ConvertActivationFunctionToAclActivationFunction(ActivationFunction armnnFunctio
switch (armnnFunction)
{
case ActivationFunction::Linear: return AclActivationFunction::LINEAR;
- // Arm compute's 'logistic' function is non-parameterized, so it is exactly a sigmoid function
+ // Arm compute's 'logistic' function is non-parameterized, so it is exactly a sigmoid function.
case ActivationFunction::Sigmoid: return AclActivationFunction::LOGISTIC;
case ActivationFunction::ReLu: return AclActivationFunction::RELU;
case ActivationFunction::BoundedReLu: return AclActivationFunction::LU_BOUNDED_RELU;
@@ -112,6 +112,14 @@ ConvertNormalizationAlgorithmChannelToAclNormType(NormalizationAlgorithmChannel
}
}
+/// Builds the ACL FullyConnectedLayerInfo that corresponds to an ArmNN FullyConnectedDescriptor.
+/// Only the transpose-weights flag is copied; every other field keeps its default-constructed value.
+inline arm_compute::FullyConnectedLayerInfo
+ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(const FullyConnectedDescriptor& fullyConnectedDesc)
+{
+    arm_compute::FullyConnectedLayerInfo aclLayerInfo;
+    aclLayerInfo.transpose_weights = fullyConnectedDesc.m_TransposeWeightMatrix;
+    return aclLayerInfo;
+}
+
}
#endif // ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED
diff --git a/src/armnn/backends/ClContextControl.cpp b/src/armnn/backends/ClContextControl.cpp
index f086328e55..68e878da79 100644
--- a/src/armnn/backends/ClContextControl.cpp
+++ b/src/armnn/backends/ClContextControl.cpp
@@ -16,6 +16,7 @@
#include <boost/format.hpp>
#include <boost/log/trivial.hpp>
#include <boost/polymorphic_cast.hpp>
+#include <boost/core/ignore_unused.hpp>
#include "LeakChecking.hpp"
@@ -29,22 +30,27 @@ class Device;
namespace armnn
{
-ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters)
+ClContextControl::ClContextControl(IGpuAccTunedParameters* clTunedParameters,
+ bool profilingEnabled)
: m_clTunedParameters(boost::polymorphic_downcast<ClTunedParameters*>(clTunedParameters))
+ , m_ProfilingEnabled(profilingEnabled)
{
+ // Ignore m_ProfilingEnabled if unused to avoid compiling problems when ArmCompute is disabled.
+ boost::ignore_unused(m_ProfilingEnabled);
+
#ifdef ARMCOMPUTECL_ENABLED
try
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
- // Select default platform as the first element
+ // Selects the first platform as the default.
cl::Platform::setDefault(platforms[0]);
std::vector<cl::Device> devices;
platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
- // Select default device as the first element
+ // Selects the first device as the default.
cl::Device::setDefault(devices[0]);
}
catch (const cl::Error& clError)
@@ -54,15 +60,15 @@ ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters)
) % clError.what() % clError.err()));
}
- // Remove the use of global CL context
+ // Removes the use of global CL context.
cl::Context::setDefault(cl::Context{});
BOOST_ASSERT(cl::Context::getDefault()() == NULL);
- // Remove the use of global CL command queue
+ // Removes the use of global CL command queue.
cl::CommandQueue::setDefault(cl::CommandQueue{});
BOOST_ASSERT(cl::CommandQueue::getDefault()() == NULL);
- // always load the OpenCL runtime
+ // Always load the OpenCL runtime.
LoadOpenClRuntime();
#endif
}
@@ -70,14 +76,14 @@ ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters)
ClContextControl::~ClContextControl()
{
#ifdef ARMCOMPUTECL_ENABLED
- // load the OpencCL runtime without the tuned parameters to free the memory for them
+ // Load the OpenCL runtime without the tuned parameters to free the memory for them.
try
{
UnloadOpenClRuntime();
}
catch (const cl::Error& clError)
{
- // this should not happen, it is ignored if it does
+ // This should not happen, it is ignored if it does.
// Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an
// exception of type std::length_error.
@@ -107,23 +113,23 @@ void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters)
if (arm_compute::CLScheduler::get().context()() != NULL)
{
- // wait for all queued CL requests to finish before reinitialising it
+ // Wait for all queued CL requests to finish before reinitialising it.
arm_compute::CLScheduler::get().sync();
}
try
{
arm_compute::CLKernelLibrary::get().clear_programs_cache();
- // initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no
+ // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no
// context references); it is initialised again, with a proper context, later.
arm_compute::CLScheduler::get().init(context, commandQueue, device);
arm_compute::CLKernelLibrary::get().init(".", context, device);
{
//
- // Here we replace the context with a new one which in
- // the memory leak checks shows as an extra allocation but
- // because of the scope of the leak check it doesn't count
+ // Here we replace the context with a new one, which
+ // the memory leak checks show as an extra allocation, but
+ // because of the scope of the leak checks, it doesn't count
// the disposal of the original object. On the other hand it
// does count the creation of this context which it flags
// as a memory leak. By adding the following line we prevent
@@ -133,24 +139,19 @@ void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters)
context = cl::Context(device);
}
- bool enableProfiling = false;
-#if ARMNN_PROFILING_ENABLED
- enableProfiling = true;
-#endif
- if (useTunedParameters &&
- m_clTunedParameters && m_clTunedParameters->m_Mode == IClTunedParameters::Mode::UpdateTunedParameters)
- {
- enableProfiling = true; // Needed for the CLTuner to work.
- }
+ // NOTE: In this specific case profiling has to be enabled on the command queue
+ // in order for the CLTuner to work.
+ bool profilingNeededForClTuner = useTunedParameters && m_clTunedParameters &&
+ m_clTunedParameters->m_Mode == IGpuAccTunedParameters::Mode::UpdateTunedParameters;
- if (enableProfiling)
+ if (m_ProfilingEnabled || profilingNeededForClTuner)
{
- // Create a new queue with profiling enabled
+ // Create a new queue with profiling enabled.
commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
}
else
{
- // Use default queue
+ // Use default queue.
commandQueue = cl::CommandQueue(context, device);
}
}
@@ -178,22 +179,22 @@ void ClContextControl::ClearClCache()
DoLoadOpenClRuntime(true);
}
-armnn::IClTunedParameters* IClTunedParameters::CreateRaw(armnn::IClTunedParameters::Mode mode)
+armnn::IGpuAccTunedParameters* IGpuAccTunedParameters::CreateRaw(armnn::IGpuAccTunedParameters::Mode mode)
{
return new ClTunedParameters(mode);
}
-armnn::IClTunedParametersPtr IClTunedParameters::Create(armnn::IClTunedParameters::Mode mode)
+armnn::IGpuAccTunedParametersPtr IGpuAccTunedParameters::Create(armnn::IGpuAccTunedParameters::Mode mode)
{
- return IClTunedParametersPtr(CreateRaw(mode), &IClTunedParameters::Destroy);
+ return IGpuAccTunedParametersPtr(CreateRaw(mode), &IGpuAccTunedParameters::Destroy);
}
-void IClTunedParameters::Destroy(IClTunedParameters* params)
+void IGpuAccTunedParameters::Destroy(IGpuAccTunedParameters* params)
{
delete params;
}
-ClTunedParameters::ClTunedParameters(armnn::IClTunedParameters::Mode mode)
+ClTunedParameters::ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode)
: m_Mode(mode)
#ifdef ARMCOMPUTECL_ENABLED
, m_Tuner(mode == ClTunedParameters::Mode::UpdateTunedParameters)
diff --git a/src/armnn/backends/ClContextControl.hpp b/src/armnn/backends/ClContextControl.hpp
index 8098e30b75..ee1b797055 100644
--- a/src/armnn/backends/ClContextControl.hpp
+++ b/src/armnn/backends/ClContextControl.hpp
@@ -13,15 +13,16 @@
namespace armnn
{
-class IClTunedParameters;
+class IGpuAccTunedParameters;
class ClTunedParameters;
-// ARM Compute OpenCL context control
+// ARM Compute OpenCL context control.
class ClContextControl
{
public:
- ClContextControl(IClTunedParameters* clTunedParameters = nullptr);
+ ClContextControl(IGpuAccTunedParameters* clTunedParameters = nullptr,
+ bool profilingEnabled = false);
virtual ~ClContextControl();
@@ -31,7 +32,7 @@ public:
// to release the cached memory used by the compute library.
void UnloadOpenClRuntime();
- // Clear the CL cache, without losing the tuned parameter settings
+ // Clear the CL cache, without losing the tuned parameter settings.
void ClearClCache();
private:
@@ -40,12 +41,13 @@ private:
ClTunedParameters* m_clTunedParameters;
+ bool m_ProfilingEnabled;
};
-class ClTunedParameters : public IClTunedParameters
+class ClTunedParameters : public IGpuAccTunedParameters
{
public:
- ClTunedParameters(armnn::IClTunedParameters::Mode mode);
+ ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode);
virtual void Load(const char* filename);
virtual void Save(const char* filename) const;
diff --git a/src/armnn/backends/ClLayerSupport.cpp b/src/armnn/backends/ClLayerSupport.cpp
index 8905adf1fc..72594ac82b 100644
--- a/src/armnn/backends/ClLayerSupport.cpp
+++ b/src/armnn/backends/ClLayerSupport.cpp
@@ -7,7 +7,6 @@
#include "ClLayerSupport.hpp"
#include "InternalTypes.hpp"
-
#include <armnn/Descriptors.hpp>
#include <armnn/Types.hpp>
#include <armnn/Tensor.hpp>
@@ -16,10 +15,21 @@
#ifdef ARMCOMPUTECL_ENABLED
#include "ClWorkloads/ClAdditionFloat32Workload.hpp"
+#include "ClWorkloads/ClActivationFloat32Workload.hpp"
+#include "ClWorkloads/ClBatchNormalizationFloat32Workload.hpp"
+
+#include "ClWorkloads/ClConvertFp16ToFp32Workload.hpp"
+#include "ClWorkloads/ClConvertFp32ToFp16Workload.hpp"
#include "ClWorkloads/ClConvolution2dBaseWorkload.hpp"
+#include "ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp"
+#include "ClWorkloads/ClL2NormalizationFloat32Workload.hpp"
+#include "ClWorkloads/ClMultiplicationFloat32Workload.hpp"
+#include "ClWorkloads/ClFullyConnectedFloat32Workload.hpp"
#include "ClWorkloads/ClPooling2dBaseWorkload.hpp"
#include "ClWorkloads/ClPermuteWorkload.hpp"
#include "ClWorkloads/ClNormalizationFloat32Workload.hpp"
+#include "ClWorkloads/ClSoftmaxBaseWorkload.hpp"
+#include "ClWorkloads/ClLstmFloat32Workload.hpp"
#endif
using namespace boost;
@@ -31,7 +41,7 @@ namespace
template<unsigned int FilterSize>
bool IsMatchingSize2d(const TensorInfo& weightInfo)
{
- // Width & Height must match
+ // Width & Height must match.
return (weightInfo.GetShape()[3] == FilterSize) && (weightInfo.GetShape()[2] == FilterSize);
}
@@ -88,58 +98,10 @@ inline bool IsWorkloadSupported(FuncType&& func, std::string* reasonIfUnsupporte
} //namespace
-bool IsClActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters)
-{
- if (parameters.m_Function != ActivationFunction::BoundedReLu)
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "Unsupported activation function, only BoundedReLu is supported";
- }
-
- return false;
- }
-
- return true;
-}
-
-bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported,
- const DepthwiseConvolution2dDescriptor& parameters,
- const TensorInfo& weights)
-{
- if (weights.GetNumDimensions() != 4)
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "Depthwise convolution Weight tensor needs to be 4d";
- }
- return false;
- }
- // weights.GetShape()[0] = channel multiplier
- if (weights.GetShape()[0] != 1)
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "Channel multiplier only supports the value 1 in the CL backend";
- }
- return false;
- }
- else if ((weights.GetDataType() == armnn::DataType::QuantisedAsymm8) && !IsMatchingSize2d<3>(weights))
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "CL backend only supports 3x3 filtering for Depthwise Convolution on 8-bit";
- }
- return false;
- }
-
- return true;
-}
-
-template<typename Float32Func, typename Uint8Func, typename ... Params>
+template<typename FloatFunc, typename Uint8Func, typename ... Params>
bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported,
DataType dataType,
- Float32Func floatFuncPtr,
+ FloatFunc floatFuncPtr,
Uint8Func uint8FuncPtr,
Params&&... params)
{
@@ -147,19 +109,21 @@ bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported,
IsSupportedForDataTypeGeneric(reasonIfUnsupported,
dataType,
floatFuncPtr,
+ floatFuncPtr,
uint8FuncPtr,
std::forward<Params>(params)...);
}
bool IsActivationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
const ActivationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<const ActivationDescriptor&>,
- &IsClActivationUint8Supported,
- descriptor);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClActivationWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor);
}
bool IsAdditionSupportedCl(const TensorInfo& input0,
@@ -167,21 +131,30 @@ bool IsAdditionSupportedCl(const TensorInfo& input0,
const TensorInfo& output,
std::string* reasonIfUnsupported)
{
- return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionFloat32Workload::IsSupported(input0,
+ return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionValidate(input0,
input1,
output,
reasonIfUnsupported));
}
bool IsBatchNormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
const BatchNormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<const BatchNormalizationDescriptor&>,
- &FalseFuncU8<const BatchNormalizationDescriptor&>,
- descriptor);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClBatchNormalizationValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ mean,
+ var,
+ beta,
+ gamma,
+ descriptor);
}
bool IsConstantSupportedCl(const TensorInfo& output,
@@ -206,20 +179,20 @@ bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convol
bool strideIsOneOrTwo = strideXIsOneOrTwo && strideYIsOneOrTwo;
bool strideIsOneOrTwoOrThree = ( strideXIsOneOrTwo || strideXIsThree ) && ( strideYIsOneOrTwo || strideYIsThree );
- // 1x1 convolution with strides of 1,2,3
+ // 1x1 convolution with strides of 1,2,3.
isSupported |= IsMatchingSize2d<1>(weightInfo) && ( strideIsOneOrTwoOrThree );
- // 3x3 convolution with strides of 1,2
+ // 3x3 convolution with strides of 1,2.
isSupported |= IsMatchingSize2d<3>(weightInfo) && ( strideIsOneOrTwo );
// 5x5 convolution with strides of 1,2
isSupported |= IsMatchingSize2d<5>(weightInfo) && ( strideIsOneOrTwo );
- //fall back to normal convolution for the asymmetric padding case.
+ //Fall back to normal convolution for the asymmetric padding case.
if (desc.m_PadLeft != desc.m_PadRight ||
desc.m_PadTop != desc.m_PadBottom)
{
- //direct convolution does not support asymmetric padding yet.
+ //Direct convolution does not support asymmetric padding yet.
isSupported = false;
}
@@ -250,27 +223,40 @@ bool IsConvolution2dSupportedCl(const TensorInfo& input,
}
bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
const DepthwiseConvolution2dDescriptor& descriptor,
const TensorInfo& weights,
+ const TensorInfo& biases,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &IsClDepthwiseConvolution2dDescParamsSupported,
- &IsClDepthwiseConvolution2dDescParamsSupported,
- descriptor,
- weights);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClDepthwiseConvolutionWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor,
+ weights,
+ biases);
}
+// Reports whether the CL backend supports a fully connected layer for the given tensors.
+// QuantisedAsymm8 input is rejected up front; everything else is decided by
+// ClFullyConnectedWorkloadValidate. When reasonIfUnsupported is non-null it receives an explanation.
bool IsFullyConnectedSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
const FullyConnectedDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- ignore_unused(descriptor);
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ // At the moment U8 is unsupported.
+ if (input.GetDataType() == DataType::QuantisedAsymm8)
+ {
+ // Report why, so callers that asked for a reason are not left with an empty string.
+ if (reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = "CL backend does not support fully connected layers with 8-bit (QuantisedAsymm8) input";
+ }
+ return false;
+ }
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClFullyConnectedWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ weights,
+ biases,
+ descriptor);
}
bool IsInputSupportedCl(const TensorInfo& input,
@@ -283,12 +269,10 @@ bool IsInputSupportedCl(const TensorInfo& input,
}
bool IsL2NormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output);
}
bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs,
@@ -304,13 +288,14 @@ bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs,
bool IsMultiplicationSupportedCl(const TensorInfo& input0,
const TensorInfo& input1,
+ const TensorInfo& output,
std::string* reasonIfUnsupported)
{
- ignore_unused(input1);
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input0.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClMultiplicationWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
}
bool IsNormalizationSupportedCl(const TensorInfo& input,
@@ -358,14 +343,12 @@ bool IsResizeBilinearSupportedCl(const TensorInfo& input,
}
bool IsSoftmaxSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
const SoftmaxDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
ignore_unused(descriptor);
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &TrueFunc<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClSoftmaxWorkloadValidate, reasonIfUnsupported, input, output);
}
bool IsSplitterSupportedCl(const TensorInfo& input,
@@ -400,10 +383,59 @@ bool IsFloorSupportedCl(const TensorInfo& input,
std::string* reasonIfUnsupported)
{
ignore_unused(output);
- return IsSupportedForDataTypeCl(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ return IsClBackendSupported(reasonIfUnsupported) &&
+ IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+ input.GetDataType(),
+ &FalseFuncF16<>,
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+// Reports whether the CL backend supports an LSTM layer with the given tensors and descriptor.
+// Every argument is forwarded, in declaration order, to ClLstmFloat32WorkloadValidate through
+// FORWARD_WORKLOAD_VALIDATE_FUNC. The tensors taken by pointer are passed on as pointers
+// (presumably the optional ones - confirm against the validate function).
+bool IsLstmSupportedCl(const TensorInfo& input,
+                       const TensorInfo& outputStateIn,
+                       const TensorInfo& cellStateIn,
+                       const TensorInfo& scratchBuffer,
+                       const TensorInfo& outputStateOut,
+                       const TensorInfo& cellStateOut,
+                       const TensorInfo& output,
+                       const LstmDescriptor& descriptor,
+                       const TensorInfo& inputToForgetWeights,
+                       const TensorInfo& inputToCellWeights,
+                       const TensorInfo& inputToOutputWeights,
+                       const TensorInfo& recurrentToForgetWeights,
+                       const TensorInfo& recurrentToCellWeights,
+                       const TensorInfo& recurrentToOutputWeights,
+                       const TensorInfo& forgetGateBias,
+                       const TensorInfo& cellBias,
+                       const TensorInfo& outputGateBias,
+                       const TensorInfo* inputToInputWeights,
+                       const TensorInfo* recurrentToInputWeights,
+                       const TensorInfo* cellToInputWeights,
+                       const TensorInfo* inputGateBias,
+                       const TensorInfo* projectionWeights,
+                       const TensorInfo* projectionBias,
+                       const TensorInfo* cellToForgetWeights,
+                       const TensorInfo* cellToOutputWeights,
+                       std::string* reasonIfUnsupported)
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClLstmFloat32WorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   outputStateIn,
+                                   cellStateIn,
+                                   scratchBuffer,
+                                   outputStateOut,
+                                   cellStateOut,
+                                   output,
+                                   descriptor,
+                                   inputToForgetWeights,
+                                   inputToCellWeights,
+                                   inputToOutputWeights,
+                                   recurrentToForgetWeights,
+                                   recurrentToCellWeights,
+                                   recurrentToOutputWeights,
+                                   forgetGateBias,
+                                   cellBias,
+                                   outputGateBias,
+                                   inputToInputWeights,
+                                   recurrentToInputWeights,
+                                   cellToInputWeights,
+                                   inputGateBias,
+                                   projectionWeights,
+                                   projectionBias,
+                                   cellToForgetWeights,
+                                   cellToOutputWeights);
+}
+
+// Reports whether the CL backend supports converting the given input tensor to the given output
+// tensor, as decided by ClConvertFp16ToFp32WorkloadValidate.
+bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input,
+                                    const TensorInfo& output,
+                                    std::string* reasonIfUnsupported)
+{
+    // NOTE(review): reasonIfUnsupported is passed twice - once as the macro's own reason argument
+    // and once in the forwarded argument list. Presumably the validate function takes it as a
+    // third parameter; confirm against its declaration.
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp16ToFp32WorkloadValidate, reasonIfUnsupported,
+                                   input, output, reasonIfUnsupported);
+}
+
+// Reports whether the CL backend supports converting the given input tensor to the given output
+// tensor, as decided by ClConvertFp32ToFp16WorkloadValidate.
+bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input,
+                                    const TensorInfo& output,
+                                    std::string* reasonIfUnsupported)
+{
+    // NOTE(review): reasonIfUnsupported is passed twice - once as the macro's own reason argument
+    // and once in the forwarded argument list. Presumably the validate function takes it as a
+    // third parameter; confirm against its declaration.
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp32ToFp16WorkloadValidate, reasonIfUnsupported,
+                                   input, output, reasonIfUnsupported);
+}
}
diff --git a/src/armnn/backends/ClLayerSupport.hpp b/src/armnn/backends/ClLayerSupport.hpp
index 4f71e907cf..791e904616 100644
--- a/src/armnn/backends/ClLayerSupport.hpp
+++ b/src/armnn/backends/ClLayerSupport.hpp
@@ -7,16 +7,17 @@
#include <armnn/DescriptorsFwd.hpp>
#include <armnn/Types.hpp>
#include <armnn/Tensor.hpp>
+#include <armnn/ArmNN.hpp>
namespace armnn
{
bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc);
-bool IsClActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters);
bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported,
const DepthwiseConvolution2dDescriptor& parameters,
const TensorInfo& weights);
bool IsActivationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
const ActivationDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -26,6 +27,11 @@ bool IsAdditionSupportedCl(const TensorInfo& input0,
std::string* reasonIfUnsupported = nullptr);
bool IsBatchNormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
const BatchNormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -40,11 +46,16 @@ bool IsConvolution2dSupportedCl(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
const DepthwiseConvolution2dDescriptor& descriptor,
const TensorInfo& weights,
+ const TensorInfo& biases,
std::string* reasonIfUnsupported = nullptr);
bool IsFullyConnectedSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
const FullyConnectedDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -52,14 +63,30 @@ bool IsInputSupportedCl(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsL2NormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
+bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr);
+
bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs,
const OriginsDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
bool IsMultiplicationSupportedCl(const TensorInfo& input0,
const TensorInfo& input1,
+ const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
bool IsNormalizationSupportedCl(const TensorInfo& input,
@@ -84,6 +111,7 @@ bool IsResizeBilinearSupportedCl(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsSoftmaxSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
const SoftmaxDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -101,4 +129,13 @@ bool IsReshapeSupportedCl(const TensorInfo& input,
bool IsFloorSupportedCl(const TensorInfo& input,
const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
}
diff --git a/src/armnn/backends/ClTensorHandle.hpp b/src/armnn/backends/ClTensorHandle.hpp
index 49e18dad59..e3618a3c46 100644
--- a/src/armnn/backends/ClTensorHandle.hpp
+++ b/src/armnn/backends/ClTensorHandle.hpp
@@ -9,9 +9,12 @@
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/CLSubTensor.h>
+#include <arm_compute/runtime/CL/CLMemoryGroup.h>
+#include <arm_compute/runtime/IMemoryGroup.h>
#include <arm_compute/core/TensorShape.h>
#include <arm_compute/core/Coordinates.h>
+#include <boost/polymorphic_pointer_cast.hpp>
namespace armnn
{
@@ -22,9 +25,8 @@ class IClTensorHandle : public ITensorHandle
public:
virtual arm_compute::ICLTensor& GetTensor() = 0;
virtual arm_compute::ICLTensor const& GetTensor() const = 0;
- virtual void Map(bool blocking = true) = 0;
- virtual void UnMap() = 0;
virtual arm_compute::DataType GetDataType() const = 0;
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) = 0;
};
class ClTensorHandle : public IClTensorHandle
@@ -37,50 +39,98 @@ public:
arm_compute::CLTensor& GetTensor() override { return m_Tensor; }
arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; }
- virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);};
+ virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);}
- virtual void Map(bool blocking = true) override {m_Tensor.map(blocking);}
- virtual void UnMap() override { m_Tensor.unmap();}
+ virtual void Manage() override
+ {
+ assert(m_MemoryGroup != nullptr);
+ m_MemoryGroup->manage(&m_Tensor);
+ }
- virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL;}
+ virtual const void* Map(bool blocking = true) const override
+ {
+ const_cast<arm_compute::CLTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override { const_cast<arm_compute::CLTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; }
+
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
virtual arm_compute::DataType GetDataType() const override
{
return m_Tensor.info()->data_type();
}
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override
+ {
+ m_MemoryGroup = boost::polymorphic_pointer_downcast<arm_compute::CLMemoryGroup>(memoryGroup);
+ }
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
private:
arm_compute::CLTensor m_Tensor;
-
+ std::shared_ptr<arm_compute::CLMemoryGroup> m_MemoryGroup;
};
+// Tensor handle wrapping an arm_compute::CLSubTensor created from a parent handle's tensor.
+// Allocate(), Manage() and SetMemoryGroup() are no-ops in this class.
class ClSubTensorHandle : public IClTensorHandle
{
public:
- ClSubTensorHandle(arm_compute::ICLTensor& parent,
- const arm_compute::TensorShape& shape,
- const arm_compute::Coordinates& coords)
- : m_Tensor(&parent, shape, coords)
+ // parent must not be null: it is dereferenced here to obtain the tensor the sub-tensor is built from.
+ ClSubTensorHandle(IClTensorHandle* parent,
+ const arm_compute::TensorShape& shape,
+ const arm_compute::Coordinates& coords)
+ : m_Tensor(&parent->GetTensor(), shape, coords)
+ , m_ParentHandle(parent)
{
}
arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; }
arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; }
- virtual void Allocate() override {};
- virtual void Map(bool blocking = true) override {m_Tensor.map(blocking);}
- virtual void UnMap() override { m_Tensor.unmap();}
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
- virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL;}
+ // Maps the sub-tensor and returns a pointer to its first element.
+ // m_Tensor is mutable, so it can be mapped from this const method without a const_cast.
+ virtual const void* Map(bool blocking = true) const override
+ {
+ m_Tensor.map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override { m_Tensor.unmap(); }
+
+ virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; }
+
+ // Returns the handle that was passed to the constructor.
+ virtual ITensorHandle* GetParent() const override { return m_ParentHandle; }
virtual arm_compute::DataType GetDataType() const override
{
return m_Tensor.info()->data_type();
}
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
private:
- arm_compute::CLSubTensor m_Tensor;
+ mutable arm_compute::CLSubTensor m_Tensor;
+ ITensorHandle* m_ParentHandle = nullptr;
};
-} \ No newline at end of file
+}
diff --git a/src/armnn/backends/ClWorkloadFactory.cpp b/src/armnn/backends/ClWorkloadFactory.cpp
index 916ca46aae..354440c7bc 100644
--- a/src/armnn/backends/ClWorkloadFactory.cpp
+++ b/src/armnn/backends/ClWorkloadFactory.cpp
@@ -15,9 +15,13 @@
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>
#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include "ClWorkloads.hpp"
+
#include "backends/MemCopyWorkload.hpp"
#include "backends/ClTensorHandle.hpp"
-#include "ClWorkloads.hpp"
+
+#include "memory/IPoolManager.hpp"
#endif
#include "MakeWorkloadHelper.hpp"
@@ -29,7 +33,9 @@
namespace armnn
{
-bool ClWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported)
+bool ClWorkloadFactory::IsLayerSupported(const Layer& layer,
+ boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
{
return IWorkloadFactory::IsLayerSupported(Compute::GpuAcc, layer, dataType, outReasonIfUnsupported);
}
@@ -43,7 +49,10 @@ ClWorkloadFactory::ClWorkloadFactory()
std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
{
- return std::make_unique<ClTensorHandle>(tensorInfo);
+ std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo);
+ tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup());
+
+ return tensorHandle;
}
std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent,
@@ -58,24 +67,25 @@ std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateSubTensorHandle(ITensorH
coords.set_num_dimensions(subTensorShape.GetNumDimensions());
for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++)
{
- // arm compute indexes tensor coords in reverse order
+ // Arm compute indexes tensor coords in reverse order.
unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
coords.set(i, boost::numeric_cast<int>(subTensorOrigin[revertedIndex]));
}
- return std::make_unique<ClSubTensorHandle>(static_cast<ClTensorHandle&>(parent).GetTensor(), shape, coords);
+ return std::make_unique<ClSubTensorHandle>(
+ boost::polymorphic_downcast<IClTensorHandle*>(&parent), shape, coords);
}
std::unique_ptr<IWorkload> ClWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<CopyFromCpuToClFloat32Workload, CopyFromCpuToClUint8Workload>(descriptor, info);
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> ClWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<CopyFromClToCpuFloat32Workload, CopyFromClToCpuUint8Workload>(descriptor, info);
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
@@ -87,7 +97,8 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQ
std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<ClSoftmaxFloat32Workload, ClSoftmaxUint8Workload>(descriptor, info, m_MemoryManager.Get());
+ return MakeWorkload<ClSoftmaxFloat32Workload, ClSoftmaxUint8Workload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
@@ -105,13 +116,14 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMerger(const MergerQu
std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateFullyConnected(
const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const
{
- return MakeWorkload<ClFullyConnectedFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get());
+ return MakeWorkload<ClFullyConnectedFloat32Workload, NullWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<ClPermuteFloat32Workload, ClPermuteUint8Workload>(descriptor, info);
+ return MakeWorkload<ClPermuteFloatWorkload, ClPermuteUint8Workload>(descriptor, info);
}
std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
@@ -124,7 +136,7 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateConvolution2d(const C
const WorkloadInfo& info) const
{
return MakeWorkload<ClConvolution2dFloat32Workload, ClConvolution2dUint8Workload>(descriptor, info,
- m_MemoryManager.Get());
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDepthwiseConvolution2d(
@@ -142,7 +154,7 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateNormalization(const N
std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<ClAdditionFloat32Workload, NullWorkload>(descriptor, info);
+ return MakeWorkload<ClAdditionFloat32Workload, ClAdditionUint8Workload>(descriptor, info);
}
std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMultiplication(
@@ -165,21 +177,7 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMemCopy(const MemCopy
throw InvalidArgumentException("ClWorkloadFactory: Invalid null input for MemCopy workload");
}
- // Create a workload that will copy tensor data from the inputs, which can have a number of different formats,
- // to CL tensors.
- switch (descriptor.m_Inputs[0]->GetType())
- {
- case ITensorHandle::Cpu:
- return MakeWorkload<CopyFromCpuToClFloat32Workload, CopyFromCpuToClUint8Workload>(descriptor, info);
-#if ARMCOMPUTENEON_ENABLED
- case ITensorHandle::Neon:
- {
- return MakeWorkload<CopyFromNeonToClFloat32Workload, CopyFromNeonToClUint8Workload>(descriptor, info);
- }
-#endif
- default:
- throw InvalidArgumentException("ClWorkloadFactory: Destination type not supported for MemCopy Workload.");
- }
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateResizeBilinear(
@@ -220,11 +218,41 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescri
return MakeWorkload<ClFloorFloat32Workload, NullWorkload>(descriptor, info);
}
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClLstmFloat32Workload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<ClConvertFp16ToFp32Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<ClConvertFp32ToFp16Workload>(descriptor, info);
+}
+
void ClWorkloadFactory::Finalize()
{
m_MemoryManager.Finalize();
}
+void ClWorkloadFactory::Release()
+{
+ m_MemoryManager.Release();
+}
+
+void ClWorkloadFactory::Acquire()
+{
+ m_MemoryManager.Acquire();
+}
+
#else // #if ARMCOMPUTECL_ENABLED
ClWorkloadFactory::ClWorkloadFactory()
@@ -375,10 +403,38 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescri
return nullptr;
}
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
void ClWorkloadFactory::Finalize()
{
}
+void ClWorkloadFactory::Release()
+{
+}
+
+void ClWorkloadFactory::Acquire()
+{
+}
+
#endif // #if ARMCOMPUTECL_ENABLED
} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloadFactory.hpp b/src/armnn/backends/ClWorkloadFactory.hpp
index 7365fe9aeb..d0786f3fba 100644
--- a/src/armnn/backends/ClWorkloadFactory.hpp
+++ b/src/armnn/backends/ClWorkloadFactory.hpp
@@ -4,14 +4,17 @@
//
#pragma once
-#include "AclBaseMemoryManager.hpp"
#include "OutputHandler.hpp"
+
#include "armnn/IRuntime.hpp"
+#include <boost/optional.hpp>
+
+#include "memory/BaseMemoryManager.hpp"
namespace armnn
{
-// ARM Compute OpenCL workload factory
+// ARM Compute OpenCL workload factory.
class ClWorkloadFactory : public IWorkloadFactory
{
public:
@@ -19,7 +22,8 @@ public:
virtual Compute GetCompute() const override { return Compute::GpuAcc; }
- static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported);
+ static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
virtual bool SupportsSubTensors() const override { return true; }
@@ -95,11 +99,26 @@ public:
virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
- void Finalize() override;
+ virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual void Finalize() override;
+
+ virtual void Release() override;
+
+ virtual void Acquire() override;
private:
- mutable AclBaseMemoryManager m_MemoryManager;
+#ifdef ARMCOMPUTECL_ENABLED
+ mutable ClMemoryManager m_MemoryManager;
+#endif
};
} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloadUtils.hpp b/src/armnn/backends/ClWorkloadUtils.hpp
index 549a0bbc25..6b6a18e865 100644
--- a/src/armnn/backends/ClWorkloadUtils.hpp
+++ b/src/armnn/backends/ClWorkloadUtils.hpp
@@ -9,6 +9,15 @@
#include <arm_compute/runtime/CL/CLFunctions.h>
#include <arm_compute/runtime/SubTensor.h>
#include "ArmComputeTensorUtils.hpp"
+#include "OpenClTimer.hpp"
+#include "CpuTensorHandle.hpp"
+#include "Half.hpp"
+
+#define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \
+ ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \
+ name, \
+ armnn::OpenClTimer(), \
+ armnn::WallClockTimer())
namespace armnn
{
@@ -17,12 +26,12 @@ template <typename T>
void CopyArmComputeClTensorData(const T* srcData, arm_compute::CLTensor& dstTensor)
{
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "MapClTensorForWriting");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("MapClTensorForWriting");
dstTensor.map(true);
}
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyToClTensor");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("CopyToClTensor");
armcomputetensorutils::CopyArmComputeITensorData<T>(srcData, dstTensor);
}
@@ -36,4 +45,21 @@ void InitialiseArmComputeClTensorData(arm_compute::CLTensor& clTensor, const T*
CopyArmComputeClTensorData<T>(data, clTensor);
}
+// Initialises a CL tensor from constant floating point data held by a CPU tensor handle.
+// The element type is chosen from the handle's TensorInfo data type:
+//   Float16 -> armnn::Half, Float32 -> float.
+// Both cases forward to InitialiseArmComputeClTensorData. Any other data type trips an assert.
+//
+// clTensor: destination CL tensor.
+// handle:   source of the constant data; must not be null (asserted).
+inline void InitializeArmComputeClTensorDataForFloatTypes(arm_compute::CLTensor& clTensor,
+                                                          const ConstCpuTensorHandle* handle)
+{
+    BOOST_ASSERT(handle);
+    switch (handle->GetTensorInfo().GetDataType())
+    {
+        case DataType::Float16:
+            InitialiseArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::Half>());
+            break;
+        case DataType::Float32:
+            InitialiseArmComputeClTensorData(clTensor, handle->GetConstTensor<float>());
+            break;
+        default:
+            // Only floating point constants are expected here.
+            BOOST_ASSERT_MSG(false, "Unexpected floating point type.");
+            break;
+    }
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads.hpp b/src/armnn/backends/ClWorkloads.hpp
index 3b8cf50ace..9f5622a491 100644
--- a/src/armnn/backends/ClWorkloads.hpp
+++ b/src/armnn/backends/ClWorkloads.hpp
@@ -7,6 +7,7 @@
#include "backends/ClWorkloads/ClActivationFloat32Workload.hpp"
#include "backends/ClWorkloads/ClActivationUint8Workload.hpp"
#include "backends/ClWorkloads/ClAdditionFloat32Workload.hpp"
+#include "backends/ClWorkloads/ClAdditionUint8Workload.hpp"
#include "backends/ClWorkloads/ClBaseConstantWorkload.hpp"
#include "backends/ClWorkloads/ClBaseMergerWorkload.hpp"
#include "backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp"
@@ -19,6 +20,7 @@
#include "backends/ClWorkloads/ClFloorFloat32Workload.hpp"
#include "backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp"
#include "backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp"
+#include "backends/ClWorkloads/ClLstmFloat32Workload.hpp"
#include "backends/ClWorkloads/ClMergerFloat32Workload.hpp"
#include "backends/ClWorkloads/ClMergerUint8Workload.hpp"
#include "backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp"
@@ -32,4 +34,6 @@
#include "backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp"
#include "backends/ClWorkloads/ClSoftmaxUint8Workload.hpp"
#include "backends/ClWorkloads/ClSplitterFloat32Workload.hpp"
-#include "backends/ClWorkloads/ClSplitterUint8Workload.hpp" \ No newline at end of file
+#include "backends/ClWorkloads/ClSplitterUint8Workload.hpp"
+#include "backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp"
+#include "backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp"
diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp
index fb5d78425e..f072549cbc 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp
@@ -9,10 +9,31 @@
namespace armnn
{
+arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ const arm_compute::ActivationLayerInfo activationLayerInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(descriptor);
+
+ if (input.GetDataType() == DataType::QuantisedAsymm8 &&
+ activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+ "CL: Logistic Activations unsupported with QAsymm8 data type."};
+ }
+
+ return arm_compute::CLActivationLayer::validate(&aclInput,
+ &aclOutput,
+ activationLayerInfo);
+}
ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<ActivationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<ActivationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClActivationFloat32Workload", 1, 1);
@@ -26,7 +47,7 @@ ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDe
void ClActivationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationFloat32Workload_Execute");
m_ActivationLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp
index 9bab4202be..9fbfe95856 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp
@@ -9,9 +9,12 @@
namespace armnn
{
+arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor);
-// Activation layer execution
-class ClActivationFloat32Workload : public Float32Workload<ActivationQueueDescriptor>
+// Activation layer execution.
+class ClActivationFloat32Workload : public FloatWorkload<ActivationQueueDescriptor>
{
public:
ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp
index 3671dd7187..75ab3d0691 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp
@@ -6,6 +6,7 @@
#include "ClActivationUint8Workload.hpp"
#include "backends/ClLayerSupport.hpp"
+#include "backends/ArmComputeUtils.hpp"
#include "backends/ClTensorHandle.hpp"
#include "backends/CpuTensorHandle.hpp"
namespace armnn
@@ -15,15 +16,8 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri
const WorkloadInfo& info)
: Uint8Workload<ActivationQueueDescriptor>(descriptor, info)
{
-
- std::string reasonIfUnsupported;
- if (!IsClActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters))
- {
- throw InvalidArgumentException(reasonIfUnsupported);
- }
-
- // Only BoundedReLu is supported (see IsClActivationUint8Supported)
- arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function);
+ arm_compute::ActivationLayerInfo layerInfo(activation,
m_Data.m_Parameters.m_A,
m_Data.m_Parameters.m_B);
@@ -37,7 +31,7 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri
void ClActivationUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationUint8Workload_Execute");
m_ActivationLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp
index 3a9cceb298..449b2d56c5 100644
--- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-// Activation layer execution
+// Activation layer execution.
class ClActivationUint8Workload : public Uint8Workload<ActivationQueueDescriptor>
{
public:
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp
new file mode 100644
index 0000000000..5dd7bb323a
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp
@@ -0,0 +1,71 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClAdditionBaseWorkload.hpp"
+
+#include "backends/ClTensorHandle.hpp"
+#include "backends/CpuTensorHandle.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+template <armnn::DataType... T>
+ClAdditionBaseWorkload<T...>::ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : TypedWorkload<AdditionQueueDescriptor, T...>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("ClAdditionBaseWorkload", 2, 1);
+
+ arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+ m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy);
+}
+
+template <armnn::DataType... T>
+void ClAdditionBaseWorkload<T...>::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionBaseWorkload_Execute");
+ m_Layer.run();
+}
+
+// Reports whether the CL backend can execute an Addition on the given tensors.
+//
+// Quantised (QuantisedAsymm8) first inputs are rejected up front. Everything else is decided by
+// arm_compute::CLArithmeticAddition::validate, called with g_AclConvertPolicy.
+//
+// input0, input1:      tensor infos of the two operands.
+// output:              tensor info of the result.
+// reasonIfUnsupported: optional out-parameter (may be null); when non-null and the layer is
+//                      unsupported it receives a human readable explanation.
+// Returns true when the addition is supported, false otherwise.
+bool ClAdditionValidate(const TensorInfo& input0,
+                        const TensorInfo& input1,
+                        const TensorInfo& output,
+                        std::string* reasonIfUnsupported)
+{
+    if (input0.GetDataType() == DataType::QuantisedAsymm8)
+    {
+        // Reject quantised addition for the moment (COMPMID-1385).
+        // The reason pointer is optional, so guard the write just like the ACL path below does.
+        if (reasonIfUnsupported)
+        {
+            *reasonIfUnsupported = "Quantised Addition not yet supported";
+        }
+        return false;
+    }
+
+    const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0);
+    const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+    const arm_compute::Status aclStatus = arm_compute::CLArithmeticAddition::validate(&aclInput0Info,
+                                                                                      &aclInput1Info,
+                                                                                      &aclOutputInfo,
+                                                                                      g_AclConvertPolicy);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported && reasonIfUnsupported)
+    {
+        *reasonIfUnsupported = aclStatus.error_description();
+    }
+
+    return supported;
+}
+
+} //namespace armnn
+
+template class armnn::ClAdditionBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class armnn::ClAdditionBaseWorkload<armnn::DataType::QuantisedAsymm8>;
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp
new file mode 100644
index 0000000000..fba8a0d457
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+template <armnn::DataType... dataTypes>
+class ClAdditionBaseWorkload : public TypedWorkload<AdditionQueueDescriptor, dataTypes...>
+{
+public:
+ ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLArithmeticAddition m_Layer;
+};
+
+bool ClAdditionValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported);
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp
index 153167f172..b69593f5f5 100644
--- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp
@@ -13,45 +13,10 @@ namespace armnn
{
using namespace armcomputetensorutils;
-ClAdditionFloat32Workload::ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : Float32Workload<AdditionQueueDescriptor>(descriptor, info)
-{
- m_Data.ValidateInputsOutputs("ClAdditionFloat32Workload", 2, 1);
-
- arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
- arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
- arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
- m_Layer.configure(&input0, &input1, &output, ms_AclConvertPolicy);
-}
-
void ClAdditionFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClAdditionFloat32Workload_Execute");
- m_Layer.run();
-}
-
-bool ClAdditionFloat32Workload::IsSupported(const TensorInfo& input0,
- const TensorInfo& input1,
- const TensorInfo& output,
- std::string* reasonIfUnsupported)
-{
- const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0);
- const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1);
- const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
-
- const arm_compute::Status aclStatus = decltype(m_Layer)::validate(&aclInput0Info,
- &aclInput1Info,
- &aclOutputInfo,
- ms_AclConvertPolicy);
-
- const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
- if (!supported && reasonIfUnsupported)
- {
- *reasonIfUnsupported = aclStatus.error_description();
- }
-
- return supported;
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionFloat32Workload_Execute");
+ ClAdditionBaseWorkload::Execute();
}
-} //namespace armnn \ No newline at end of file
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp
index 37e50c2c86..7eac485cfe 100644
--- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp
@@ -5,26 +5,16 @@
#pragma once
-#include "backends/ClWorkloadUtils.hpp"
+#include "ClAdditionBaseWorkload.hpp"
namespace armnn
{
-class ClAdditionFloat32Workload : public Float32Workload<AdditionQueueDescriptor>
+class ClAdditionFloat32Workload : public ClAdditionBaseWorkload<DataType::Float16, DataType::Float32>
{
public:
- ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
-
+ using ClAdditionBaseWorkload<DataType::Float16, DataType::Float32>::ClAdditionBaseWorkload;
void Execute() const override;
-
- static bool IsSupported(const TensorInfo& input0,
- const TensorInfo& input1,
- const TensorInfo& output,
- std::string* reasonIfUnsupported);
-
-private:
- mutable arm_compute::CLArithmeticAddition m_Layer;
- static constexpr arm_compute::ConvertPolicy ms_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
};
-} //namespace armnn \ No newline at end of file
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp
new file mode 100644
index 0000000000..a72ceca471
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp
@@ -0,0 +1,18 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClAdditionUint8Workload.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+void ClAdditionUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionUint8Workload_Execute");
+ ClAdditionBaseWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp
new file mode 100644
index 0000000000..73ff287e7e
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "ClAdditionBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class ClAdditionUint8Workload : public ClAdditionBaseWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ using ClAdditionBaseWorkload<DataType::QuantisedAsymm8>::ClAdditionBaseWorkload;
+ void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp
index 4b72d92d72..e0bc365053 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp
@@ -4,17 +4,19 @@
//
#include "ClBaseConstantWorkload.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
#include "backends/ClTensorHandle.hpp"
#include "backends/CpuTensorHandle.hpp"
+#include "Half.hpp"
namespace armnn
{
-template class ClBaseConstantWorkload<DataType::Float32>;
+template class ClBaseConstantWorkload<DataType::Float16, DataType::Float32>;
template class ClBaseConstantWorkload<DataType::QuantisedAsymm8>;
-template<armnn::DataType dataType>
-void ClBaseConstantWorkload<dataType>::Execute() const
+template<armnn::DataType... dataTypes>
+void ClBaseConstantWorkload<dataTypes...>::Execute() const
{
// The intermediate tensor held by the corresponding layer output handler can be initialised with the given data
// on the first inference, then reused for subsequent inferences.
@@ -26,15 +28,21 @@ void ClBaseConstantWorkload<dataType>::Execute() const
BOOST_ASSERT(data.m_LayerOutput != nullptr);
arm_compute::CLTensor& output = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetTensor();
+ arm_compute::DataType computeDataType = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetDataType();
- switch (dataType)
+ switch (computeDataType)
{
- case DataType::Float32:
+ case arm_compute::DataType::F16:
+ {
+ CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
+ break;
+ }
+ case arm_compute::DataType::F32:
{
CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<float>(), output);
break;
}
- case DataType::QuantisedAsymm8:
+ case arm_compute::DataType::QASYMM8:
{
CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<uint8_t>(), output);
break;
diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp
index 660842f375..7ad7bb93ca 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp
@@ -9,12 +9,12 @@
namespace armnn
{
-template <armnn::DataType DataType>
-class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataType>
+template <armnn::DataType... DataTypes>
+class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataTypes...>
{
public:
ClBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
- : TypedWorkload<ConstantQueueDescriptor, DataType>(descriptor, info)
+ : TypedWorkload<ConstantQueueDescriptor, DataTypes...>(descriptor, info)
, m_RanOnce(false)
{
}
diff --git a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp
index 7542c62b47..531e32961b 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp
@@ -10,16 +10,16 @@
namespace armnn
{
-// Base class template providing an implementation of the Merger layer common to all data types
-template <armnn::DataType DataType>
-class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataType>
+// Base class template providing an implementation of the Merger layer common to all data types.
+template <armnn::DataType... DataTypes>
+class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...>
{
public:
- using TypedWorkload<MergerQueueDescriptor, DataType>::TypedWorkload;
+ using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload;
void Execute() const override
{
- // With subtensors, merger is a no-op
+ // With subtensors, merger is a no-op.
}
};
diff --git a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp
index fef841ced2..8e4f10f9fd 100644
--- a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp
@@ -10,16 +10,16 @@
namespace armnn
{
-// Base class template providing an implementation of the Splitter layer common to all data types
-template <armnn::DataType DataType>
-class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataType>
+// Base class template providing an implementation of the Splitter layer common to all data types.
+template <armnn::DataType... DataTypes>
+class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...>
{
public:
- using TypedWorkload<SplitterQueueDescriptor, DataType>::TypedWorkload;
+ using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload;
void Execute() const override
{
- // With subtensors, merger is a no-op
+ // With subtensors, splitter is a no-op.
}
};
diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp
index dabd495d59..1849c5d411 100644
--- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp
@@ -7,36 +7,88 @@
#include "backends/ClTensorHandle.hpp"
#include "backends/CpuTensorHandle.hpp"
#include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ClLayerSupport.hpp"
namespace armnn
{
using namespace armcomputetensorutils;
+arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor &desc)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean);
+ const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var);
+ const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta);
+ const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma);
+
+ return arm_compute::CLBatchNormalizationLayer::validate(&aclInputInfo,
+ &aclOutputInfo,
+ &aclMeanInfo,
+ &aclVarInfo,
+ &aclBetaInfo,
+ &aclGammaInfo,
+ desc.m_Eps);
+}
+
ClBatchNormalizationFloat32Workload::ClBatchNormalizationFloat32Workload(
const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
- : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
{
- BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo());
- BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo());
- BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo());
- BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo());
+ m_Mean = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo());
+
+ m_Variance = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo());
+
+ m_Gamma = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo());
+
+ m_Beta = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
m_Data.ValidateInputsOutputs("ClBatchNormalizationFloat32Workload", 1, 1);
arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
- m_Layer.configure(&input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps);
- InitialiseArmComputeClTensorData(m_Mean, m_Data.m_Mean->GetConstTensor<float>());
- InitialiseArmComputeClTensorData(m_Variance, m_Data.m_Variance->GetConstTensor<float>());
- InitialiseArmComputeClTensorData(m_Beta, m_Data.m_Beta->GetConstTensor<float>());
- InitialiseArmComputeClTensorData(m_Gamma, m_Data.m_Gamma->GetConstTensor<float>());
+ m_Layer.configure(&input,
+ &output,
+ m_Mean.get(),
+ m_Variance.get(),
+ m_Beta.get(),
+ m_Gamma.get(),
+ m_Data.m_Parameters.m_Eps);
+
+ InitializeArmComputeClTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean);
+ InitializeArmComputeClTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance);
+ InitializeArmComputeClTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta);
+ InitializeArmComputeClTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma);
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_Layer.prepare();
+ FreeUnusedTensors();
}
void ClBatchNormalizationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClBatchNormalizationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClBatchNormalizationFloat32Workload_Execute");
m_Layer.run();
}
+void ClBatchNormalizationFloat32Workload::FreeUnusedTensors() // Invoked by the constructor right after m_Layer.prepare().
+{
+ FreeTensorIfUnused(m_Mean); // Each constant tensor is handed to FreeTensorIfUnused, which presumably
+ FreeTensorIfUnused(m_Variance); // releases it once the layer no longer references it; the helper is
+ FreeTensorIfUnused(m_Gamma); // defined outside this file, so confirm its exact semantics there.
+ FreeTensorIfUnused(m_Beta);
+}
+
} //namespace armnn \ No newline at end of file
diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp
index ddbd0f05c0..a45614a284 100644
--- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp
@@ -10,21 +10,31 @@
namespace armnn
{
-class ClBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor>
+arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, // Returns the status of CLBatchNormalizationLayer::validate()
+ const TensorInfo& output, // for the given tensor descriptions.
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& desc); // Only desc.m_Eps is forwarded to the validation call.
+
+class ClBatchNormalizationFloat32Workload : public FloatWorkload<BatchNormalizationQueueDescriptor>
{
public:
ClBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
- using Float32Workload<BatchNormalizationQueueDescriptor>::Float32Workload;
+ using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload;
void Execute() const override;
private:
mutable arm_compute::CLBatchNormalizationLayer m_Layer;
- arm_compute::CLTensor m_Mean;
- arm_compute::CLTensor m_Variance;
- arm_compute::CLTensor m_Gamma;
- arm_compute::CLTensor m_Beta;
+ std::unique_ptr<arm_compute::CLTensor> m_Mean;
+ std::unique_ptr<arm_compute::CLTensor> m_Variance;
+ std::unique_ptr<arm_compute::CLTensor> m_Gamma;
+ std::unique_ptr<arm_compute::CLTensor> m_Beta;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp
index 99880d68a7..58594999a8 100644
--- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp
@@ -9,7 +9,7 @@ namespace armnn
void ClConstantFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantFloat32Workload_Execute");
ClBaseConstantWorkload::Execute();
}
diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp
index 5f86d3b2b6..11c3fda8db 100644
--- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp
@@ -9,10 +9,10 @@
namespace armnn
{
-class ClConstantFloat32Workload : public ClBaseConstantWorkload<DataType::Float32>
+class ClConstantFloat32Workload : public ClBaseConstantWorkload<DataType::Float16, DataType::Float32>
{
public:
- using ClBaseConstantWorkload<DataType::Float32>::ClBaseConstantWorkload;
+ using ClBaseConstantWorkload<DataType::Float16, DataType::Float32>::ClBaseConstantWorkload;
void Execute() const override;
};
diff --git a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp
index 078d4261fa..82ce436557 100644
--- a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp
@@ -9,7 +9,7 @@ namespace armnn
void ClConstantUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantUint8Workload_Execute");
ClBaseConstantWorkload::Execute();
}
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp
new file mode 100644
index 0000000000..4914be78bc
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp
@@ -0,0 +1,64 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClConvertFp16ToFp32Workload.hpp"
+#include "backends/ClTensorHandle.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+// Sets up the Fp16 -> Fp32 conversion: checks the workload has one input and one output, then
+// configures the depth-convert layer on the CL tensors behind those handles.
+ClConvertFp16ToFp32Workload::ClConvertFp16ToFp32Workload(
+    const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info) :
+    Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("ClConvertFp16ToFp32Workload", 1, 1);
+
+    // The handles are CL tensor handles on this backend, so the downcast exposes the ICLTensor.
+    auto* const sourceHandle = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0]);
+    arm_compute::ICLTensor& sourceTensor = sourceHandle->GetTensor();
+
+    auto* const targetHandle = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0]);
+    arm_compute::ICLTensor& targetTensor = targetHandle->GetTensor();
+
+    // Last argument is the shift; no shift is applied.
+    m_Layer.configure(&sourceTensor, &targetTensor, g_AclConvertPolicy, 0);
+}
+
+void ClConvertFp16ToFp32Workload::Execute() const // Runs the conversion layer that the constructor configured.
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp16ToFp32Workload_Execute");
+ m_Layer.run(); // m_Layer is declared mutable, which is what allows this call from a const method.
+}
+
+// Checks whether the CL backend can convert `input` (must be Float16) into `output` (must be Float32).
+//
+// Returns a RUNTIME_ERROR status when either data type is wrong; otherwise returns the status
+// reported by arm_compute::CLDepthConvertLayer::validate().
+// reasonIfUnsupported is optional: when it is non-null and the check fails, it receives the
+// failure description. It is never dereferenced when null.
+arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input,
+                                                         const TensorInfo& output,
+                                                         std::string* reasonIfUnsupported)
+{
+    // Builds the failure status for a data type mismatch. The reason is written through
+    // reasonIfUnsupported only when the caller supplied it, matching the null check that the
+    // Compute Library branch below already performs.
+    const auto rejectDataType = [reasonIfUnsupported](const std::string& reason)
+    {
+        if (reasonIfUnsupported)
+        {
+            *reasonIfUnsupported = reason;
+        }
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, reason);
+    };
+
+    if (input.GetDataType() != DataType::Float16)
+    {
+        return rejectDataType("Input should be Float16");
+    }
+    if (output.GetDataType() != DataType::Float32)
+    {
+        return rejectDataType("Output should be Float32");
+    }
+
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+    const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate(
+        &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported && reasonIfUnsupported)
+    {
+        *reasonIfUnsupported = aclStatus.error_description();
+    }
+
+    return aclStatus;
+}
+
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp
new file mode 100644
index 0000000000..36ccbb7144
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+// CL workload that converts its Float16 input tensor into its Float32 output tensor by means of
+// arm_compute::CLDepthConvertLayer.
+class ClConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
+{
+public:
+    // Calls ValidateInputsOutputs(..., 1, 1) on the descriptor and configures m_Layer with the
+    // input and output CL tensors.
+    ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+
+    // Runs the configured conversion layer. `override` already implies virtual, so the keyword is
+    // not repeated, in line with the other workload classes.
+    void Execute() const override;
+
+private:
+    // mutable so that the const Execute() can call m_Layer.run().
+    mutable arm_compute::CLDepthConvertLayer m_Layer;
+};
+
+arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, // Must be Float16; any other data type is rejected.
+ const TensorInfo& output, // Must be Float32; any other data type is rejected.
+ std::string* reasonIfUnsupported); // Receives the failure description when the check does not pass.
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp
new file mode 100644
index 0000000000..19e064351f
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp
@@ -0,0 +1,64 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClConvertFp32ToFp16Workload.hpp"
+#include "backends/ClTensorHandle.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+// Sets up the Fp32 -> Fp16 conversion: checks the workload has one input and one output, then
+// configures the depth-convert layer on the CL tensors behind those handles.
+ClConvertFp32ToFp16Workload::ClConvertFp32ToFp16Workload(
+    const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info) :
+    Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("ClConvertFp32ToFp16Workload", 1, 1);
+
+    // The handles are CL tensor handles on this backend, so the downcast exposes the ICLTensor.
+    auto* const sourceHandle = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0]);
+    arm_compute::ICLTensor& sourceTensor = sourceHandle->GetTensor();
+
+    auto* const targetHandle = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0]);
+    arm_compute::ICLTensor& targetTensor = targetHandle->GetTensor();
+
+    // Last argument is the shift; no shift is applied.
+    m_Layer.configure(&sourceTensor, &targetTensor, g_AclConvertPolicy, 0);
+}
+
+void ClConvertFp32ToFp16Workload::Execute() const // Runs the conversion layer that the constructor configured.
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp32ToFp16Workload_Execute");
+ m_Layer.run(); // m_Layer is declared mutable, which is what allows this call from a const method.
+}
+
+// Checks whether the CL backend can convert `input` (must be Float32) into `output` (must be Float16).
+//
+// Returns a RUNTIME_ERROR status when either data type is wrong; otherwise returns the status
+// reported by arm_compute::CLDepthConvertLayer::validate().
+// reasonIfUnsupported is optional: when it is non-null and the check fails, it receives the
+// failure description. It is never dereferenced when null.
+arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input,
+                                                         const TensorInfo& output,
+                                                         std::string* reasonIfUnsupported)
+{
+    // Builds the failure status for a data type mismatch. The reason is written through
+    // reasonIfUnsupported only when the caller supplied it, matching the null check that the
+    // Compute Library branch below already performs.
+    const auto rejectDataType = [reasonIfUnsupported](const std::string& reason)
+    {
+        if (reasonIfUnsupported)
+        {
+            *reasonIfUnsupported = reason;
+        }
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, reason);
+    };
+
+    if (input.GetDataType() != DataType::Float32)
+    {
+        return rejectDataType("Input should be Float32");
+    }
+    if (output.GetDataType() != DataType::Float16)
+    {
+        return rejectDataType("Output should be Float16");
+    }
+
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+    const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate(
+        &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported && reasonIfUnsupported)
+    {
+        *reasonIfUnsupported = aclStatus.error_description();
+    }
+
+    return aclStatus;
+}
+
+
+} //namespace armnn \ No newline at end of file
diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp
new file mode 100644
index 0000000000..02a442dabc
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+// CL workload that converts its Float32 input tensor into its Float16 output tensor by means of
+// arm_compute::CLDepthConvertLayer.
+class ClConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
+{
+public:
+    // Calls ValidateInputsOutputs(..., 1, 1) on the descriptor and configures m_Layer with the
+    // input and output CL tensors.
+    ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
+
+    // Runs the configured conversion layer. `override` already implies virtual, so the keyword is
+    // not repeated, in line with the other workload classes.
+    void Execute() const override;
+
+private:
+    // mutable so that the const Execute() can call m_Layer.run().
+    mutable arm_compute::CLDepthConvertLayer m_Layer;
+};
+
+arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, // Must be Float32; any other data type is rejected.
+ const TensorInfo& output, // Must be Float16; any other data type is rejected.
+ std::string* reasonIfUnsupported); // Receives the failure description when the check does not pass.
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp
index d7aef3d223..9ac31df5c1 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp
@@ -15,13 +15,15 @@ using namespace armcomputetensorutils;
ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor,
const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<Convolution2dQueueDescriptor>(descriptor, info)
+ : FloatWorkload<Convolution2dQueueDescriptor>(descriptor, info)
, m_ConvolutionLayer(memoryManager)
{
- // todo: check tensor shapes match
+ // todo: check tensor shapes match.
const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
- BuildArmComputeTensor(m_KernelTensor, weightInfo);
+
+ m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo);
arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
m_Data.m_Parameters.m_StrideY,
@@ -31,11 +33,10 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution
m_Data.m_Parameters.m_PadBottom,
arm_compute::DimensionRoundingType::FLOOR);
- arm_compute::CLTensor* optionalBias = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBias = &m_BiasTensor;
+ m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
}
m_Data.ValidateInputsOutputs("ClConvolution2dFloat32Workload", 1, 1);
@@ -44,24 +45,35 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution
arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
m_ConvolutionLayer.configure(&input,
- &m_KernelTensor,
- optionalBias,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
- InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<float>());
+ InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight);
- if (optionalBias)
+ if (m_BiasTensor)
{
- InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<float>());
+ InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
}
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_ConvolutionLayer.prepare();
+ FreeUnusedTensors();
}
void ClConvolution2dFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dFloat32Workload_Execute");
m_ConvolutionLayer.run();
}
+void ClConvolution2dFloat32Workload::FreeUnusedTensors() // Invoked by the constructor right after m_ConvolutionLayer.prepare().
+{
+ FreeTensorIfUnused(m_KernelTensor); // Always created by the constructor.
+ FreeTensorIfUnused(m_BiasTensor); // Null when bias is disabled; presumably tolerated by the helper - confirm.
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp
index 4cf73c89cc..51c21aec32 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp
@@ -14,7 +14,7 @@
namespace armnn
{
-class ClConvolution2dFloat32Workload : public Float32Workload<Convolution2dQueueDescriptor>
+class ClConvolution2dFloat32Workload : public FloatWorkload<Convolution2dQueueDescriptor>
{
public:
ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
@@ -22,10 +22,12 @@ public:
void Execute() const override;
private:
- mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
+ mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
- arm_compute::CLTensor m_KernelTensor;
- arm_compute::CLTensor m_BiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp
index cf419e752e..a78d7fb4a2 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp
@@ -18,10 +18,11 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu
: Uint8Workload<Convolution2dQueueDescriptor>(descriptor, info)
, m_ConvolutionLayer(memoryManager)
{
-
// todo: check tensor shapes match
const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
- BuildArmComputeTensor(m_KernelTensor, weightInfo);
+
+ m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo);
arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
m_Data.m_Parameters.m_StrideY,
@@ -31,11 +32,10 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu
m_Data.m_Parameters.m_PadBottom,
arm_compute::DimensionRoundingType::FLOOR);
- arm_compute::CLTensor* optionalBias = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBias = &m_BiasTensor;
+ m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
}
m_Data.ValidateInputsOutputs("ClConvolution2dUint8Workload", 1, 1);
@@ -44,25 +44,36 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu
arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
m_ConvolutionLayer.configure(&input,
- &m_KernelTensor,
- optionalBias,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
- InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
+ InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
- if (optionalBias)
+ if (m_BiasTensor)
{
- InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<int32_t>());
+ InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor<int32_t>());
}
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_ConvolutionLayer.prepare();
+ FreeUnusedTensors();
}
void ClConvolution2dUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dUint8Workload_Execute");
m_ConvolutionLayer.run();
}
+void ClConvolution2dUint8Workload::FreeUnusedTensors() // Invoked by the constructor right after m_ConvolutionLayer.prepare().
+{
+ FreeTensorIfUnused(m_KernelTensor); // Always created by the constructor.
+ FreeTensorIfUnused(m_BiasTensor); // Null when bias is disabled; presumably tolerated by the helper - confirm.
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp
index d4d3908c80..7d9eb76ba1 100644
--- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp
@@ -22,10 +22,12 @@ public:
void Execute() const override;
private:
- mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
+ mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
- arm_compute::CLTensor m_KernelTensor;
- arm_compute::CLTensor m_BiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp
new file mode 100644
index 0000000000..cfb8485039
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp
@@ -0,0 +1,122 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
+#include "TypeUtils.hpp"
+
+#include "backends/ArmComputeUtils.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ClTensorHandle.hpp"
+#include "backends/CpuTensorHandle.hpp"
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+// Asks the Compute Library whether CLDepthwiseConvolutionLayer accepts this configuration.
+// The bias description is only passed on when descriptor.m_BiasEnabled is set, and the depth
+// multiplier is taken from the first dimension of the weights shape. The status returned by
+// CLDepthwiseConvolutionLayer::validate() is handed back unchanged.
+arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+                                                           const TensorInfo& output,
+                                                           const DepthwiseConvolution2dDescriptor& descriptor,
+                                                           const TensorInfo& weights,
+                                                           const TensorInfo& biases)
+{
+    const arm_compute::TensorInfo inputAcl   = BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo outputAcl  = BuildArmComputeTensorInfo(output);
+    const arm_compute::TensorInfo weightsAcl = BuildArmComputeTensorInfo(weights);
+
+    // The bias info stays default-constructed, and is not passed on, unless bias is enabled.
+    arm_compute::TensorInfo biasAcl;
+    if (descriptor.m_BiasEnabled)
+    {
+        biasAcl = BuildArmComputeTensorInfo(biases);
+    }
+    arm_compute::TensorInfo* const biasAclOrNull = descriptor.m_BiasEnabled ? &biasAcl : nullptr;
+
+    const arm_compute::PadStrideInfo padStride = BuildArmComputePadStrideInfo(descriptor);
+    const unsigned int depthMultiplier = weights.GetShape()[0];
+
+    return arm_compute::CLDepthwiseConvolutionLayer::validate(
+        &inputAcl, &weightsAcl, biasAclOrNull, &outputAcl, padStride, depthMultiplier);
+}
+
+// Creates the kernel (and, when enabled, bias) CL tensors and configures the depthwise convolution
+// layer on them. A 3x3 kernel selects CLDepthwiseConvolutionLayer3x3; every other kernel size uses
+// the generic CLDepthwiseConvolutionLayer. Nothing here copies weight or bias data into the tensors.
+template<armnn::DataType... dataTypes>
+ClDepthwiseConvolutionBaseWorkload<dataTypes...>::ClDepthwiseConvolutionBaseWorkload(
+    const DepthwiseConvolution2dQueueDescriptor& descriptor,
+    const WorkloadInfo& info)
+    : TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>(descriptor, info)
+{
+    const auto& kernelInfo = m_Data.m_Weight->GetTensorInfo();
+    const auto& params     = m_Data.m_Parameters;
+
+    m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+    BuildArmComputeTensor(*m_KernelTensor, kernelInfo);
+
+    // m_BiasTensor is left null when bias is disabled, so .get() below yields nullptr.
+    if (params.m_BiasEnabled)
+    {
+        m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
+    }
+
+    arm_compute::PadStrideInfo padStrideInfo(params.m_StrideX,
+                                             params.m_StrideY,
+                                             params.m_PadLeft,
+                                             params.m_PadRight,
+                                             params.m_PadTop,
+                                             params.m_PadBottom,
+                                             arm_compute::DimensionRoundingType::FLOOR);
+
+    // The workload name used in validation messages embeds the weight data type.
+    std::string workloadName("ClDepthwiseConvolution");
+    workloadName += GetDataTypeName(kernelInfo.GetDataType());
+    workloadName += "Workload";
+    m_Data.ValidateInputsOutputs(workloadName, 1, 1);
+
+    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    const auto& kernelShape = kernelInfo.GetShape();
+    const unsigned int depthMultiplier = kernelShape[0];
+
+    // Check for optimisation opportunities: a 3x3 kernel has a dedicated layer.
+    const bool is3x3Kernel = (kernelShape[3] == 3) && (kernelShape[2] == 3);
+    if (is3x3Kernel)
+    {
+        // Configure through the concrete type first, then store it behind the IFunction pointer.
+        auto layer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
+        layer->configure(&input,
+                         m_KernelTensor.get(),
+                         m_BiasTensor.get(),
+                         &output,
+                         padStrideInfo,
+                         depthMultiplier);
+        m_DepthwiseConvolutionLayer = std::move(layer);
+    }
+    else
+    {
+        auto layer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
+        layer->configure(&input,
+                         m_KernelTensor.get(),
+                         m_BiasTensor.get(),
+                         &output,
+                         padStrideInfo,
+                         depthMultiplier);
+        m_DepthwiseConvolutionLayer = std::move(layer);
+    }
+
+    BOOST_ASSERT(m_DepthwiseConvolutionLayer);
+}
+
+template<armnn::DataType... dataTypes>
+void ClDepthwiseConvolutionBaseWorkload<dataTypes...>::FreeUnusedTensors() // The derived workloads call this after m_DepthwiseConvolutionLayer->prepare().
+{
+ FreeTensorIfUnused(m_KernelTensor); // Always created by the constructor.
+ FreeTensorIfUnused(m_BiasTensor); // Null when bias is disabled; presumably tolerated by the helper - confirm.
+}
+
+// Generate known implementations for linker
+template class ClDepthwiseConvolutionBaseWorkload<DataType::Float16, DataType::Float32>;
+template class ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp
new file mode 100644
index 0000000000..a879efc89e
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp
@@ -0,0 +1,37 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, // Returns the status of CLDepthwiseConvolutionLayer::validate().
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor, // Supplies the pad/stride settings and m_BiasEnabled.
+ const TensorInfo& weights, // weights.GetShape()[0] is used as the depth multiplier.
+ const TensorInfo& biases); // Only consulted when descriptor.m_BiasEnabled is set.
+
+template<armnn::DataType... dataTypes>
+class ClDepthwiseConvolutionBaseWorkload : public TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...> // Shared base of the Float32 and Uint8 CL depthwise convolution workloads.
+{
+public:
+ using TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>::m_Data; // Lets the dependent-base member be named without this->.
+
+ ClDepthwiseConvolutionBaseWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor, // Creates the kernel/bias CL tensors and configures the layer;
+ const WorkloadInfo& info); // copying data into those tensors is left to the derived classes.
+
+protected:
+ std::unique_ptr<arm_compute::IFunction> m_DepthwiseConvolutionLayer; // CLDepthwiseConvolutionLayer3x3 for 3x3 kernels, otherwise CLDepthwiseConvolutionLayer.
+
+ std::unique_ptr<arm_compute::CLTensor> m_KernelTensor; // Always created by the constructor.
+ std::unique_ptr<arm_compute::CLTensor> m_BiasTensor; // Stays null when m_Parameters.m_BiasEnabled is false.
+
+ void FreeUnusedTensors(); // Passes both tensors to FreeTensorIfUnused.
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp
index f31c73bc60..96d97ad4ea 100644
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp
@@ -4,8 +4,8 @@
//
#include "ClDepthwiseConvolutionFloat32Workload.hpp"
-#include "ClDepthwiseConvolutionHelper.hpp"
-#include "backends/ClTensorHandle.hpp"
+
+#include "backends/ClWorkloadUtils.hpp"
#include "backends/CpuTensorHandle.hpp"
namespace armnn
@@ -14,17 +14,25 @@ namespace armnn
ClDepthwiseConvolutionFloat32Workload::ClDepthwiseConvolutionFloat32Workload(
const DepthwiseConvolution2dQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+ : ClDepthwiseConvolutionBaseWorkload(descriptor, info)
{
- InitClDepthwiseConvolutionWorkload(*this);
+ InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight);
+
+ if (m_BiasTensor)
+ {
+ InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ m_DepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
}
void ClDepthwiseConvolutionFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionFloat32Workload_Execute");
- BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionFloat32Workload_Execute");
+ BOOST_ASSERT(m_DepthwiseConvolutionLayer);
- m_pDepthwiseConvolutionLayer->run();
+ m_DepthwiseConvolutionLayer->run();
}
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp
index 8711f0c515..669fd928b5 100644
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp
@@ -5,29 +5,20 @@
#pragma once
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
#include "backends/ClWorkloadUtils.hpp"
namespace armnn
{
-class ClDepthwiseConvolutionFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor>
+class ClDepthwiseConvolutionFloat32Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::Float16,
+ DataType::Float32>
{
public:
ClDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
const WorkloadInfo& info);
void Execute() const override;
-
-private:
- typedef float KernelDataType;
- typedef float BiasDataType;
-
- mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
-
- arm_compute::CLTensor m_KernelTensor;
- arm_compute::CLTensor m_BiasTensor;
-
- template <typename WorkloadType>
- friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload);
};
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp
deleted file mode 100644
index cd7115773d..0000000000
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// See LICENSE file in the project root for full license information.
-//
-
-#pragma once
-
-#include <armnn/TypesUtils.hpp>
-#include "backends/ClLayerSupport.hpp"
-#include "backends/ArmComputeTensorUtils.hpp"
-#include "backends/ClTensorHandle.hpp"
-
-namespace armnn
-{
-
-template <typename WorkloadType>
-void InitClDepthwiseConvolutionWorkload(WorkloadType& workload)
-{
- using T = typename WorkloadType::KernelDataType;
- using B = typename WorkloadType::BiasDataType;
-
- auto& m_Data = workload.GetData();
- auto& m_KernelTensor = workload.m_KernelTensor;
- auto& m_BiasTensor = workload.m_BiasTensor;
- auto& m_pDepthwiseConvolutionLayer = workload.m_pDepthwiseConvolutionLayer;
-
- auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
-
- std::string reasonIfUnsupported;
- if (!IsClDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo))
- {
- throw UnimplementedException(reasonIfUnsupported);
- }
-
- armcomputetensorutils::BuildArmComputeTensor(m_KernelTensor, weightInfo);
-
- arm_compute::CLTensor* optionalBias = nullptr;
- if (m_Data.m_Parameters.m_BiasEnabled)
- {
- armcomputetensorutils::BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBias = &m_BiasTensor;
- }
-
- arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
- m_Data.m_Parameters.m_StrideY,
- m_Data.m_Parameters.m_PadLeft,
- m_Data.m_Parameters.m_PadRight,
- m_Data.m_Parameters.m_PadTop,
- m_Data.m_Parameters.m_PadBottom,
- arm_compute::DimensionRoundingType::FLOOR);
-
- std::string name = std::string("ClDepthwiseConvolution") + GetDataTypeName(GetDataType<T>()) + "Workload";
- m_Data.ValidateInputsOutputs(name, 1, 1);
-
- arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
- arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-
- //Check for optimisation opportunities.
- bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3);
- if (use3x3Optimisation)
- {
- m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
- static_cast<arm_compute::CLDepthwiseConvolutionLayer3x3*>(m_pDepthwiseConvolutionLayer.get())->configure(
- &input,
- &m_KernelTensor,
- optionalBias,
- &output,
- padStrideInfo);
- }
- else
- {
- m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
- static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_pDepthwiseConvolutionLayer.get())->configure(
- &input,
- &m_KernelTensor,
- optionalBias,
- &output,
- padStrideInfo);
- }
-
- BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
-
- InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor<T>());
-
- if (optionalBias)
- {
- InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->template GetConstTensor<B>());
- }
-}
-
-} //namespace armnn \ No newline at end of file
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp
index 7e7c488c74..4852ce8bf9 100644
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp
@@ -4,28 +4,34 @@
//
#include "ClDepthwiseConvolutionUint8Workload.hpp"
-#include "ClDepthwiseConvolutionHelper.hpp"
-#include "backends/ClTensorHandle.hpp"
+
#include "backends/CpuTensorHandle.hpp"
namespace armnn
{
-
ClDepthwiseConvolutionUint8Workload::ClDepthwiseConvolutionUint8Workload(
const DepthwiseConvolution2dQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Uint8Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+ : ClDepthwiseConvolutionBaseWorkload(descriptor, info)
{
- InitClDepthwiseConvolutionWorkload(*this);
+ InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<uint8_t>());
+
+ if (m_BiasTensor)
+ {
+ InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>());
+ }
+
+ m_DepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
}
void ClDepthwiseConvolutionUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionUint8Workload_Execute");
- BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionUint8Workload_Execute");
+ BOOST_ASSERT(m_DepthwiseConvolutionLayer);
- m_pDepthwiseConvolutionLayer->run();
+ m_DepthwiseConvolutionLayer->run();
}
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp
index ee09ff3e58..a4277d405f 100644
--- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp
@@ -5,29 +5,19 @@
#pragma once
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
#include "backends/ClWorkloadUtils.hpp"
namespace armnn
{
-class ClDepthwiseConvolutionUint8Workload : public Uint8Workload<DepthwiseConvolution2dQueueDescriptor>
+class ClDepthwiseConvolutionUint8Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8>
{
public:
ClDepthwiseConvolutionUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
const WorkloadInfo& info);
void Execute() const override;
-
-private:
- typedef uint8_t KernelDataType;
- typedef int32_t BiasDataType;
-
- mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
-
- arm_compute::CLTensor m_KernelTensor;
- arm_compute::CLTensor m_BiasTensor;
-
- template <typename WorkloadType>
- friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload);
};
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp
index 882da50855..da71c50305 100644
--- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
{
ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info)
- : Float32Workload<FloorQueueDescriptor>(descriptor, info)
+ : FloatWorkload<FloorQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClFloorFloat32Workload", 1, 1);
@@ -22,7 +22,7 @@ ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descr
void ClFloorFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFloorFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClFloorFloat32Workload_Execute");
m_Layer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp
index 532dd29884..bd7f3032fc 100644
--- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class ClFloorFloat32Workload : public Float32Workload<FloorQueueDescriptor>
+class ClFloorFloat32Workload : public FloatWorkload<FloorQueueDescriptor>
{
public:
ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp
index 5dfab9cbbd..5014dd27ca 100644
--- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp
@@ -7,47 +7,89 @@
#include "backends/ClTensorHandle.hpp"
#include "backends/CpuTensorHandle.hpp"
#include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ArmComputeUtils.hpp"
+#include "backends/ClLayerSupport.hpp"
namespace armnn
{
using namespace armcomputetensorutils;
+arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights);
+
+ arm_compute::TensorInfo aclBiases;
+ arm_compute::TensorInfo *optionalAclBiases = nullptr;
+ if (descriptor.m_BiasEnabled)
+ {
+ aclBiases = BuildArmComputeTensorInfo(biases);
+ optionalAclBiases = &aclBiases;
+ }
+
+ const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo =
+ ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor);
+
+ return arm_compute::CLFullyConnectedLayer::validate(&aclInput,
+ &aclWeights,
+ optionalAclBiases,
+ &aclOutput,
+ fullyConnectedLayerInfo);
+}
+
ClFullyConnectedFloat32Workload::ClFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor,
const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info)
- , m_FullyConnected(memoryManager)
+ : FloatWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
+ , m_FullyConnectedLayer(memoryManager)
{
+ m_WeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo());
- BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo());
-
- arm_compute::CLTensor* optionalBiasTensor = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBiasTensor = &m_BiasesTensor;
+ m_BiasesTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo());
}
m_Data.ValidateInputsOutputs("ClFullyConnectedFloat32Workload", 1, 1);
arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
// Construct
- m_FullyConnected.configure(
- &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix);
+ arm_compute::FullyConnectedLayerInfo fc_info;
+ fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix;
+ m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
// Allocate
- InitialiseArmComputeClTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor<float>());
+ InitializeArmComputeClTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight);
- if (optionalBiasTensor)
+ if (m_BiasesTensor)
{
- InitialiseArmComputeClTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor<float>());
+ InitializeArmComputeClTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias);
}
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_FullyConnectedLayer.prepare();
+ FreeUnusedTensors();
}
void ClFullyConnectedFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFullyConnectedFloat32Workload_Execute");
- m_FullyConnected.run();
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClFullyConnectedFloat32Workload_Execute");
+ m_FullyConnectedLayer.run();
+}
+
+void ClFullyConnectedFloat32Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_WeightsTensor);
+ FreeTensorIfUnused(m_BiasesTensor);
}
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp
index c8d1227bda..f580e580c6 100644
--- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp
@@ -14,20 +14,29 @@
namespace armnn
{
-class ClFullyConnectedFloat32Workload : public armnn::Float32Workload<armnn::FullyConnectedQueueDescriptor>
+arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor);
+
+class ClFullyConnectedFloat32Workload : public armnn::FloatWorkload<armnn::FullyConnectedQueueDescriptor>
{
public:
ClFullyConnectedFloat32Workload(const armnn::FullyConnectedQueueDescriptor& descriptor,
const armnn::WorkloadInfo& info,
std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
- using armnn::Float32Workload<armnn::FullyConnectedQueueDescriptor>::m_Data;
+ using armnn::FloatWorkload<armnn::FullyConnectedQueueDescriptor>::m_Data;
void Execute() const override;
private:
- mutable arm_compute::CLFullyConnectedLayer m_FullyConnected;
- arm_compute::CLTensor m_WeightsTensor;
- arm_compute::CLTensor m_BiasesTensor;
+ mutable arm_compute::CLFullyConnectedLayer m_FullyConnectedLayer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_WeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasesTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp
index e15db74ec9..628e38d3da 100644
--- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp
@@ -12,9 +12,21 @@ namespace armnn
{
using namespace armcomputetensorutils;
+arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output);
+
+ arm_compute::NormalizationLayerInfo normalizationInfo =
+ CreateAclNormalizationLayerInfoForL2Normalization(input);
+
+ return arm_compute::CLNormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo);
+}
+
ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<L2NormalizationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClL2NormalizationFloat32Workload", 1, 1);
@@ -25,7 +37,7 @@ ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2Norma
void ClL2NormalizationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClL2NormalizationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClL2NormalizationFloat32Workload_Execute");
m_Layer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp
index 848803e2f0..bf898e31f7 100644
--- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp
@@ -10,7 +10,10 @@
namespace armnn
{
-class ClL2NormalizationFloat32Workload : public Float32Workload<L2NormalizationQueueDescriptor>
+arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output);
+
+class ClL2NormalizationFloat32Workload : public FloatWorkload<L2NormalizationQueueDescriptor>
{
public:
ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp
new file mode 100644
index 0000000000..db5c303854
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp
@@ -0,0 +1,405 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClLstmFloat32Workload.hpp"
+#include "backends/ClTensorHandle.hpp"
+#include "backends/CpuTensorHandle.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ClLayerSupport.hpp"
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+ClLstmFloat32Workload::ClLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
+ : FloatWorkload<LstmQueueDescriptor>(descriptor, info)
+{
+ arm_compute::LSTMParams<arm_compute::ICLTensor> lstm_param;
+
+ // Basic parameters
+ m_InputToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo());
+
+ m_InputToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo());
+
+ m_InputToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo());
+
+ m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo());
+
+ m_RecurrentToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo());
+
+ m_RecurrentToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo());
+
+ m_ForgetGateBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo());
+
+ m_CellBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo());
+
+ m_OutputGateBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo());
+
+ // for future reference: check the AndroidNN API for the logic here
+ if (!m_Data.m_Parameters.m_CifgEnabled)
+ {
+ m_InputToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo());
+
+ m_RecurrentToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo());
+
+ m_CellToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ if (m_Data.m_CellToInputWeights != nullptr)
+ {
+ BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo());
+ }
+
+ m_InputGateBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo());
+
+ lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(),
+ m_RecurrentToInputWeightsTensor.get(),
+ m_Data.m_CellToInputWeights != nullptr ? m_CellToInputWeightsTensor.get() : nullptr,
+ m_InputGateBiasTensor.get());
+ }
+
+ if (m_Data.m_Parameters.m_ProjectionEnabled)
+ {
+ m_ProjectionWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo());
+
+ m_ProjectionBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ if (m_Data.m_ProjectionBias != nullptr)
+ {
+ BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo());
+ }
+
+ lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(),
+ m_Data.m_ProjectionBias != nullptr ? m_ProjectionBiasTensor.get() : nullptr);
+ }
+
+ if (m_Data.m_Parameters.m_PeepholeEnabled)
+ {
+ m_CellToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo());
+
+ m_CellToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo());
+
+ lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get());
+ }
+
+ const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ const arm_compute::ICLTensor& output_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ const arm_compute::ICLTensor& cell_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetTensor();
+
+ arm_compute::ICLTensor& output_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[1])->GetTensor();
+ arm_compute::ICLTensor& cell_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[2])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[3])->GetTensor();
+
+ // Get the batch_size and the num_units from the cellStateIn dimensions
+ const TensorInfo& inputTensorInfo = info.m_InputTensorInfos[2];
+ const unsigned int batch_size = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[0]);
+ const unsigned int num_units = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[1]);
+
+ m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>();
+ if (m_Data.m_Parameters.m_CifgEnabled)
+ {
+ // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG
+ armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32);
+ BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
+ }
+ else
+ {
+ // scratch_buffer [num_units * 3, batch_size] without CIFG
+ armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32);
+ BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
+ }
+
+ float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell;
+ float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj;
+
+ // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations
+ arm_compute::ActivationLayerInfo activationLayerInfo;
+ if (m_Data.m_Parameters.m_ActivationFunc == 0)
+ {
+ // no activation, do nothing
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 1)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 3)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0);
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 4)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0);
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 6)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
+ }
+ else
+ {
+ throw armnn::Exception("Wrong Type of Activation Function!");
+ }
+
+
+ m_LstmLayer.configure(&input, m_InputToForgetWeightsTensor.get(), m_InputToCellWeightsTensor.get(),
+ m_InputToOutputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(),
+ m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(),
+ m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(),
+ &output_state_in, &cell_state_in, m_ScratchBuffer.get(), &output_state_out,
+ &cell_state_out, &output, lstm_param, activationLayerInfo,
+ cell_threshold, projection_threshold);
+
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer);
+
+ InitialiseArmComputeClTensorData(*m_InputToForgetWeightsTensor,
+ m_Data.m_InputToForgetWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_InputToCellWeightsTensor,
+ m_Data.m_InputToCellWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_InputToOutputWeightsTensor,
+ m_Data.m_InputToOutputWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_RecurrentToForgetWeightsTensor,
+ m_Data.m_RecurrentToForgetWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_RecurrentToCellWeightsTensor,
+ m_Data.m_RecurrentToCellWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_RecurrentToOutputWeightsTensor,
+ m_Data.m_RecurrentToOutputWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_ForgetGateBiasTensor,
+ m_Data.m_ForgetGateBias->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_CellBiasTensor,
+ m_Data.m_CellBias->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_OutputGateBiasTensor,
+ m_Data.m_OutputGateBias->GetConstTensor<float>());
+
+ if (!m_Data.m_Parameters.m_CifgEnabled)
+ {
+ InitialiseArmComputeClTensorData(*m_InputToInputWeightsTensor,
+ m_Data.m_InputToInputWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_RecurrentToInputWeightsTensor,
+ m_Data.m_RecurrentToInputWeights->GetConstTensor<float>());
+ if (m_Data.m_CellToInputWeights != nullptr)
+ {
+ InitialiseArmComputeClTensorData(*m_CellToInputWeightsTensor,
+ m_Data.m_CellToInputWeights->GetConstTensor<float>());
+ }
+ InitialiseArmComputeClTensorData(*m_InputGateBiasTensor,
+ m_Data.m_InputGateBias->GetConstTensor<float>());
+ }
+
+ if (m_Data.m_Parameters.m_ProjectionEnabled)
+ {
+ InitialiseArmComputeClTensorData(*m_ProjectionWeightsTensor,
+ m_Data.m_ProjectionWeights->GetConstTensor<float>());
+ if (m_Data.m_ProjectionBias != nullptr)
+ {
+ InitialiseArmComputeClTensorData(*m_ProjectionBiasTensor,
+ m_Data.m_ProjectionBias->GetConstTensor<float>());
+ }
+ }
+
+ if (m_Data.m_Parameters.m_PeepholeEnabled)
+ {
+ InitialiseArmComputeClTensorData(*m_CellToForgetWeightsTensor,
+ m_Data.m_CellToForgetWeights->GetConstTensor<float>());
+ InitialiseArmComputeClTensorData(*m_CellToOutputWeightsTensor,
+ m_Data.m_CellToOutputWeights->GetConstTensor<float>());
+ }
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_LstmLayer.prepare();
+ FreeUnusedTensors();
+}
+
+void ClLstmFloat32Workload::Execute() const
+{
+ m_LstmLayer.run();
+}
+
+arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights,
+ const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights,
+ const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights,
+ const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias,
+ const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights,
+ const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias,
+ const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias,
+ const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights)
+{
+ arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info;
+
+ // The inputs and the outputs
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn);
+ const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn);
+ const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer);
+ const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut);
+ const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ // Basic parameters
+ const arm_compute::TensorInfo aclInputToForgetWeightsInfo = BuildArmComputeTensorInfo(inputToForgetWeights);
+ const arm_compute::TensorInfo aclInputToCellWeightsInfo = BuildArmComputeTensorInfo(inputToCellWeights);
+ const arm_compute::TensorInfo aclInputToOutputWeightsInfo = BuildArmComputeTensorInfo(inputToOutputWeights);
+ const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo
+ = BuildArmComputeTensorInfo(recurrentToForgetWeights);
+ const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo
+ = BuildArmComputeTensorInfo(recurrentToCellWeights);
+ const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo
+ = BuildArmComputeTensorInfo(recurrentToOutputWeights);
+ const arm_compute::TensorInfo aclForgetGateBiasInfo = BuildArmComputeTensorInfo(forgetGateBias);
+ const arm_compute::TensorInfo aclCellBiasInfo = BuildArmComputeTensorInfo(cellBias);
+ const arm_compute::TensorInfo aclOutputGateBiasInfo = BuildArmComputeTensorInfo(outputGateBias);
+
+ arm_compute::TensorInfo aclInputToInputWeightsInfo;
+ arm_compute::TensorInfo aclRecurrentToInputWeightsInfo;
+ arm_compute::TensorInfo aclCellToInputWeightsInfo;
+ arm_compute::TensorInfo aclInputGateBiasInfo;
+ arm_compute::TensorInfo aclProjectionWeightsInfo;
+ arm_compute::TensorInfo aclProjectionBiasInfo;
+ arm_compute::TensorInfo aclCellToForgetWeightsInfo;
+ arm_compute::TensorInfo aclCellToOutputWeightsInfo;
+
+ if (!descriptor.m_CifgEnabled)
+ {
+ armnn::TensorInfo inputToInputWInfo = *inputToInputWeights;
+ aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(inputToInputWInfo);
+ armnn::TensorInfo recurrentToInputWInfo = *recurrentToInputWeights;
+ aclRecurrentToInputWeightsInfo = BuildArmComputeTensorInfo(recurrentToInputWInfo);
+
+ if (cellToInputWeights != nullptr)
+ {
+ armnn::TensorInfo cellToInputWInfo = *cellToInputWeights;
+ aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(cellToInputWInfo);
+ }
+ armnn::TensorInfo inputGateBiasInfo = *inputGateBias;
+ aclInputGateBiasInfo = BuildArmComputeTensorInfo(inputGateBiasInfo);
+ lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo, &aclRecurrentToInputWeightsInfo,
+ cellToInputWeights != nullptr ? &aclCellToInputWeightsInfo: nullptr,
+ &aclInputGateBiasInfo);
+ }
+
+ if (descriptor.m_ProjectionEnabled)
+ {
+ const armnn::TensorInfo& projectionWInfo = *projectionWeights;
+ aclProjectionWeightsInfo = BuildArmComputeTensorInfo(projectionWInfo);
+
+ if (projectionBias != nullptr)
+ {
+ const armnn::TensorInfo& projectionBiasInfo = *projectionBias;
+ aclProjectionBiasInfo = BuildArmComputeTensorInfo(projectionBiasInfo);
+ }
+ lstm_params_info.set_projection_params(&aclProjectionWeightsInfo,
+ projectionBias != nullptr ? &aclProjectionBiasInfo: nullptr);
+ }
+
+ if (descriptor.m_PeepholeEnabled)
+ {
+ const armnn::TensorInfo& cellToForgetWInfo = *cellToForgetWeights;
+ aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(cellToForgetWInfo);
+ const armnn::TensorInfo& cellToOutputWInfo = *cellToOutputWeights;
+ aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(cellToOutputWInfo);
+ lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo);
+ }
+
+ float cell_threshold = descriptor.m_ClippingThresCell;
+ float projection_threshold = descriptor.m_ClippingThresProj;
+
+ // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations
+ arm_compute::ActivationLayerInfo activationLayerInfo;
+ if (descriptor.m_ActivationFunc == 0)
+ {
+ // no activation, do nothing
+ }
+ else if (descriptor.m_ActivationFunc == 1)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
+ }
+ else if (descriptor.m_ActivationFunc == 3)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0);
+ }
+ else if (descriptor.m_ActivationFunc == 4)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0);
+ }
+ else if (descriptor.m_ActivationFunc == 6)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
+ }
+ else
+ {
+ throw armnn::Exception("Wrong Type of Activation Function!");
+ }
+
+ return arm_compute::CLLSTMLayer::validate(&aclInputInfo, &aclInputToForgetWeightsInfo,
+ &aclInputToCellWeightsInfo,
+ &aclInputToOutputWeightsInfo,
+ &aclRecurrentToForgetWeightsInfo,
+ &aclRecurrentToCellWeightsInfo,
+ &aclRecurrentToOutputWeightsInfo,
+ &aclForgetGateBiasInfo,
+ &aclCellBiasInfo,
+ &aclOutputGateBiasInfo,
+ &aclOutputStateInInfo, &aclCellStateInInfo,
+ &aclScratchBufferInfo, &aclOutputStateOutInfo,
+ &aclCellStateOutInfo, &aclOutputInfo,
+ lstm_params_info, activationLayerInfo,
+ cell_threshold, projection_threshold);
+}
+
+void ClLstmFloat32Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_InputToInputWeightsTensor);
+ FreeTensorIfUnused(m_InputToForgetWeightsTensor);
+ FreeTensorIfUnused(m_InputToCellWeightsTensor);
+ FreeTensorIfUnused(m_InputToOutputWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToInputWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToCellWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor);
+ FreeTensorIfUnused(m_CellToInputWeightsTensor);
+ FreeTensorIfUnused(m_CellToForgetWeightsTensor);
+ FreeTensorIfUnused(m_CellToOutputWeightsTensor);
+ FreeTensorIfUnused(m_InputGateBiasTensor);
+ FreeTensorIfUnused(m_ForgetGateBiasTensor);
+ FreeTensorIfUnused(m_CellBiasTensor);
+ FreeTensorIfUnused(m_OutputGateBiasTensor);
+ FreeTensorIfUnused(m_ProjectionWeightsTensor);
+ FreeTensorIfUnused(m_ProjectionBiasTensor);
+ FreeTensorIfUnused(m_ScratchBuffer);
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp
new file mode 100644
index 0000000000..e2358ad10d
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp
@@ -0,0 +1,67 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+
+namespace armnn
+{
+
+// LSTM workload that wraps an arm_compute::CLLSTMLayer and owns the CL tensors holding its
+// weights, biases and scratch buffer.
+class ClLstmFloat32Workload : public FloatWorkload<LstmQueueDescriptor>
+{
+public:
+ ClLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ // NOTE(review): mutable presumably so that the layer can be run from the const Execute() - confirm.
+ mutable arm_compute::CLLSTMLayer m_LstmLayer;
+
+ // Owned CL tensors, one per LSTM weight or bias.
+ std::unique_ptr<arm_compute::CLTensor> m_InputToInputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputToForgetWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputToCellWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputToOutputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToInputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToForgetWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToCellWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToOutputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellToInputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellToForgetWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellToOutputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputGateBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_ForgetGateBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_OutputGateBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_ProjectionWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_ProjectionBiasTensor;
+
+ // Owned CL tensor used as the LSTM scratch buffer.
+ std::unique_ptr<arm_compute::CLTensor> m_ScratchBuffer;
+
+ // Calls FreeTensorIfUnused() on each of the tensor members declared above.
+ void FreeUnusedTensors();
+};
+
+// Validation entry point for the CL LSTM workload; reports the outcome as an arm_compute::Status.
+// The tensor infos taken by reference are always required by the signature.
+// NOTE(review): the tensor infos taken by pointer (input gate, projection and cell-to-gate weights)
+// are presumably optional and may be null when the corresponding LSTM feature is disabled -
+// confirm against the definition and the callers.
+arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor &descriptor,
+ const TensorInfo& inputToForgetWeights,
+ const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights,
+ const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights,
+ const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias,
+ const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights,
+ const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias,
+ const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias,
+ const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights);
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp
index 4d2d708a0e..89e7690a36 100644
--- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp
@@ -11,7 +11,7 @@ namespace armnn
void ClMergerFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerFloat32Workload_Execute");
ClBaseMergerWorkload::Execute();
}
diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp
index 9808d30ccf..3cafa23c1e 100644
--- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp
@@ -10,10 +10,10 @@
namespace armnn
{
-class ClMergerFloat32Workload : public ClBaseMergerWorkload<armnn::DataType::Float32>
+class ClMergerFloat32Workload : public ClBaseMergerWorkload<DataType::Float16, DataType::Float32>
{
public:
- using ClBaseMergerWorkload<armnn::DataType::Float32>::ClBaseMergerWorkload;
+ using ClBaseMergerWorkload<DataType::Float16, DataType::Float32>::ClBaseMergerWorkload;
virtual void Execute() const override;
};
diff --git a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp
index 94a1d3c593..551135b7da 100644
--- a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp
@@ -11,7 +11,7 @@ namespace armnn
void ClMergerUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerUint8Workload_Execute");
ClBaseMergerWorkload<DataType::QuantisedAsymm8>::Execute();
}
diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp
index 405d109aa1..7aa33146f3 100644
--- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp
@@ -10,9 +10,29 @@
namespace armnn
{
+// Asks arm_compute::CLPixelWiseMultiplication whether it can multiply input0 by input1 into output,
+// and returns the resulting Status.
+arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ // Convert the Arm NN tensor descriptions into their Arm Compute Library equivalents.
+ const arm_compute::TensorInfo aclLhsInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclRhsInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclResultInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ // With a scale of 1.0 on F32 tensors, configure() currently rejects every rounding policy except
+ // TO_ZERO, although the rounding policy does not appear to be used for F32 tensors at all.
+ // TO_ZERO is therefore passed here as well.
+ const float scale = 1.0f;
+ const arm_compute::ConvertPolicy overflowPolicy = arm_compute::ConvertPolicy::SATURATE;
+ const arm_compute::RoundingPolicy roundingPolicy = arm_compute::RoundingPolicy::TO_ZERO;
+
+ return arm_compute::CLPixelWiseMultiplication::validate(&aclLhsInfo,
+ &aclRhsInfo,
+ &aclResultInfo,
+ scale,
+ overflowPolicy,
+ roundingPolicy);
+}
+
+
ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<MultiplicationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClMultiplicationFloat32Workload", 2, 1);
@@ -30,9 +50,9 @@ ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const Multiplic
void ClMultiplicationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMultiplicationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClMultiplicationFloat32Workload_Execute");
- // Execute the layer
+ // Executes the layer.
m_PixelWiseMultiplication.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp
index 8e387118e8..0d6199047d 100644
--- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp
@@ -9,12 +9,17 @@
namespace armnn
{
-class ClMultiplicationFloat32Workload : public Float32Workload<MultiplicationQueueDescriptor>
+
+arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class ClMultiplicationFloat32Workload : public FloatWorkload<MultiplicationQueueDescriptor>
{
public:
ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info);
- using Float32Workload<MultiplicationQueueDescriptor>::Float32Workload;
+ using FloatWorkload<MultiplicationQueueDescriptor>::FloatWorkload;
void Execute() const override;
private:
diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp
index a163ec2883..d23d6e11bd 100644
--- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp
@@ -27,7 +27,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, con
ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<NormalizationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClNormalizationFloat32Workload", 1, 1);
@@ -42,7 +42,7 @@ ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const Normalizati
void ClNormalizationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClNormalizationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClNormalizationFloat32Workload_Execute");
m_NormalizationLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp
index cbd5fa92a9..e8ab0b9a18 100644
--- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp
@@ -14,7 +14,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input,
const TensorInfo& output,
const NormalizationDescriptor& descriptor);
-class ClNormalizationFloat32Workload : public Float32Workload<NormalizationQueueDescriptor>
+class ClNormalizationFloat32Workload : public FloatWorkload<NormalizationQueueDescriptor>
{
public:
ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp
index 3147e95b2e..3c132cb8f8 100644
--- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp
@@ -24,10 +24,10 @@ arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descripto
return arm_compute::Status{};
}
-template <armnn::DataType DataType>
-ClPermuteWorkload<DataType>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor,
+template <armnn::DataType... DataTypes>
+ClPermuteWorkload<DataTypes...>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : TypedWorkload<PermuteQueueDescriptor, DataType>(descriptor, info)
+ : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info)
{
using armcomputetensorutils::BuildArmComputePermutationVector;
@@ -37,18 +37,18 @@ ClPermuteWorkload<DataType>::ClPermuteWorkload(const PermuteQueueDescriptor& des
arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
- // Run the layer
+ // Run the layer.
m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings));
}
-template <armnn::DataType DataType>
-void ClPermuteWorkload<DataType>::Execute() const
+template <armnn::DataType... DataTypes>
+void ClPermuteWorkload<DataTypes...>::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, GetName() + "_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL( GetName() + "_Execute");
m_PermuteFunction.run();
}
-template class ClPermuteWorkload<DataType::Float32>;
+template class ClPermuteWorkload<DataType::Float16, DataType::Float32>;
template class ClPermuteWorkload<DataType::QuantisedAsymm8>;
} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp
index 430c59524e..c8726bc2c6 100644
--- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp
@@ -7,6 +7,7 @@
#include "backends/Workload.hpp"
#include "backends/WorkloadData.hpp"
+#include "backends/ClWorkloadUtils.hpp"
#include <armnn/TypesUtils.hpp>
#include <arm_compute/runtime/CL/functions/CLPermute.h>
@@ -18,13 +19,13 @@ namespace armnn
arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descriptor);
-template <armnn::DataType DataType>
-class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataType>
+template<armnn::DataType... DataTypes>
+class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...>
{
public:
static const std::string& GetName()
{
- static const std::string name = std::string("ClPermute") + GetDataTypeName(DataType) + "Workload";
+ static const std::string name = std::string("ClPermuteWorkload");
return name;
}
@@ -32,11 +33,11 @@ public:
void Execute() const override;
private:
- using TypedWorkload<PermuteQueueDescriptor, DataType>::m_Data;
+ using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data;
mutable arm_compute::CLPermute m_PermuteFunction;
};
-using ClPermuteFloat32Workload = ClPermuteWorkload<DataType::Float32>;
+using ClPermuteFloatWorkload = ClPermuteWorkload<DataType::Float16, DataType::Float32>;
using ClPermuteUint8Workload = ClPermuteWorkload<DataType::QuantisedAsymm8>;
-} //namespace armnn
+} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp
index dbdc06f174..6b8a230912 100644
--- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp
@@ -25,10 +25,10 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input,
return arm_compute::CLPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo);
}
-template <armnn::DataType dataType>
-ClPooling2dBaseWorkload<dataType>::ClPooling2dBaseWorkload(
+template <armnn::DataType... dataTypes>
+ClPooling2dBaseWorkload<dataTypes...>::ClPooling2dBaseWorkload(
const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name)
- : TypedWorkload<Pooling2dQueueDescriptor, dataType>(descriptor, info)
+ : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info)
{
m_Data.ValidateInputsOutputs(name, 1, 1);
@@ -37,11 +37,11 @@ ClPooling2dBaseWorkload<dataType>::ClPooling2dBaseWorkload(
arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters);
- // Run the layer
+ // Run the layer.
m_PoolingLayer.configure(&input, &output, layerInfo);
}
-template class ClPooling2dBaseWorkload<DataType::Float32>;
+template class ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>;
template class ClPooling2dBaseWorkload<DataType::QuantisedAsymm8>;
}
diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp
index 828f000505..aea32c9e86 100644
--- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp
@@ -14,12 +14,12 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input,
const TensorInfo& output,
const Pooling2dDescriptor& descriptor);
-// Base class template providing an implementation of the Pooling2d layer common to all data types
-template <armnn::DataType dataType>
-class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataType>
+// Base class template providing an implementation of the Pooling2d layer common to all data types.
+template <armnn::DataType... dataTypes>
+class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>
{
public:
- using TypedWorkload<Pooling2dQueueDescriptor, dataType>::m_Data;
+ using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data;
ClPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info,
const std::string& name);
diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp
index a7f5855b8a..3a5b8ca526 100644
--- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp
@@ -10,13 +10,13 @@ namespace armnn
ClPooling2dFloat32Workload::ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : ClPooling2dBaseWorkload<DataType::Float32>(descriptor, info, "ClPooling2dFloat32Workload")
+ : ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>(descriptor, info, "ClPooling2dFloat32Workload")
{
}
void ClPooling2dFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dFloat32Workload_Execute");
m_PoolingLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp
index 3456a2cff8..ad189bdb52 100644
--- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload<DataType::Float32>
+class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>
{
public:
ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp
index 2d2109e252..94cf753f5a 100644
--- a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp
@@ -16,7 +16,7 @@ ClPooling2dUint8Workload::ClPooling2dUint8Workload(const Pooling2dQueueDescripto
void ClPooling2dUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dUint8Workload_Execute");
m_PoolingLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp
index 7b4ad4415b..05fba222ac 100644
--- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp
@@ -11,7 +11,7 @@ namespace armnn
{
ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info)
- : Float32Workload<ReshapeQueueDescriptor>(descriptor, info)
+ : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClReshapeFloat32Workload", 1, 1);
@@ -23,7 +23,7 @@ ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor&
void ClReshapeFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeFloat32Workload_Execute");
m_Layer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp
index e344ee08ad..0eb4d08da0 100644
--- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class ClReshapeFloat32Workload : public Float32Workload<ReshapeQueueDescriptor>
+class ClReshapeFloat32Workload : public FloatWorkload<ReshapeQueueDescriptor>
{
public:
ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp
index 36cc1dec17..050fb9aa33 100644
--- a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp
@@ -21,7 +21,7 @@ ClReshapeUint8Workload::ClReshapeUint8Workload(const ReshapeQueueDescriptor& des
void ClReshapeUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeUint8Workload_Execute");
m_Layer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp
index d71011a2e3..abef682611 100644
--- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp
@@ -14,7 +14,7 @@ namespace armnn
ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<ResizeBilinearQueueDescriptor>(descriptor, info)
+ : FloatWorkload<ResizeBilinearQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("ClResizeBilinearFloat32Workload", 1, 1);
@@ -28,7 +28,7 @@ ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBil
void ClResizeBilinearFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClResizeBilinearFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClResizeBilinearFloat32Workload_Execute");
m_ResizeBilinearLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp
index 5f70e71619..81c0566bb3 100644
--- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class ClResizeBilinearFloat32Workload : public Float32Workload<ResizeBilinearQueueDescriptor>
+class ClResizeBilinearFloat32Workload : public FloatWorkload<ResizeBilinearQueueDescriptor>
{
public:
ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp
new file mode 100644
index 0000000000..cd3107cfe1
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClSoftmaxBaseWorkload.hpp"
+
+#include "backends/ArmComputeTensorUtils.hpp"
+
+namespace armnn
+{
+
+// Reports whether the CL backend can run softmax for the given input and output tensor infos.
+arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output)
+{
+ const auto inputRank = input.GetShape().GetNumDimensions();
+
+ if (inputRank < 4u)
+ {
+ // Fewer than four dimensions: let ACL decide.
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::CLSoftmaxLayer::validate(&aclInput, &aclOutput);
+ }
+
+ // NOTE: softmax over four or more dimensions is reported as unsupported until ACL supports it fully.
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported");
+}
+
+}
diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp
new file mode 100644
index 0000000000..e0113134af
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp
@@ -0,0 +1,16 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+// Reports, as an arm_compute::Status, whether the CL backend can run softmax for the given
+// input and output tensor infos.
+arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output);
+
+} // namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp
index 1d05172b42..08247bc593 100644
--- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp
@@ -12,7 +12,7 @@ namespace armnn
ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<SoftmaxQueueDescriptor>(descriptor, info)
+ : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info)
, m_SoftmaxLayer(memoryManager)
{
m_Data.ValidateInputsOutputs("ClSoftmaxFloat32Workload", 1, 1);
@@ -24,7 +24,7 @@ ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor&
void ClSoftmaxFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSoftmaxFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxFloat32Workload_Execute");
m_SoftmaxLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp
index cf5c45ac6f..6cad59800b 100644
--- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp
@@ -14,7 +14,7 @@
namespace armnn
{
-class ClSoftmaxFloat32Workload : public Float32Workload<SoftmaxQueueDescriptor>
+class ClSoftmaxFloat32Workload : public FloatWorkload<SoftmaxQueueDescriptor>
{
public:
ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp
index ee9ab4754b..3cd9a6a5ec 100644
--- a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp
@@ -33,7 +33,7 @@ ClSoftmaxUint8Workload::ClSoftmaxUint8Workload(const SoftmaxQueueDescriptor& des
void ClSoftmaxUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSoftmaxUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxUint8Workload_Execute");
m_SoftmaxLayer.run();
}
diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp
index 6221d56766..8a622c6caf 100644
--- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void ClSplitterFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterFloat32Workload_Execute");
ClBaseSplitterWorkload::Execute();
}
diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp
index cfc7eaa3c2..affa9f840f 100644
--- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp
+++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp
@@ -10,10 +10,10 @@
namespace armnn
{
-class ClSplitterFloat32Workload : public ClBaseSplitterWorkload<DataType::Float32>
+class ClSplitterFloat32Workload : public ClBaseSplitterWorkload<DataType::Float16, DataType::Float32>
{
public:
- using ClBaseSplitterWorkload<DataType::Float32>::ClBaseSplitterWorkload;
+ using ClBaseSplitterWorkload<DataType::Float16, DataType::Float32>::ClBaseSplitterWorkload;
virtual void Execute() const override;
};
diff --git a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp
index 3aa470894c..d2d25495e0 100644
--- a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp
+++ b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void ClSplitterUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterUint8Workload_Execute");
ClBaseSplitterWorkload::Execute();
}
diff --git a/src/armnn/backends/CpuTensorHandle.cpp b/src/armnn/backends/CpuTensorHandle.cpp
index dd8176c9ec..78cf6efd2e 100644
--- a/src/armnn/backends/CpuTensorHandle.cpp
+++ b/src/armnn/backends/CpuTensorHandle.cpp
@@ -45,6 +45,12 @@ ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ConstTensor& tensor)
CopyFrom(tensor.GetMemoryArea(), tensor.GetNumBytes());
}
+// Delegates to the TensorInfo constructor using the source handle's TensorInfo, then copies
+// GetNumBytes() bytes of the source handle's data into this handle.
+ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ConstCpuTensorHandle& tensorHandle)
+: ScopedCpuTensorHandle(tensorHandle.GetTensorInfo())
+{
+ const void* const sourceData = tensorHandle.GetConstTensor<void>();
+ const auto numBytes = tensorHandle.GetTensorInfo().GetNumBytes();
+ CopyFrom(sourceData, numBytes);
+}
+
ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ScopedCpuTensorHandle& other)
: CpuTensorHandle(other.GetTensorInfo())
{
diff --git a/src/armnn/backends/CpuTensorHandle.hpp b/src/armnn/backends/CpuTensorHandle.hpp
index 4bf4439083..3376650ec3 100644
--- a/src/armnn/backends/CpuTensorHandle.hpp
+++ b/src/armnn/backends/CpuTensorHandle.hpp
@@ -9,10 +9,12 @@
#include "OutputHandler.hpp"
+#include <algorithm>
+
namespace armnn
{
-// Abstract tensor handle wrapping a CPU-readable region of memory, interpreting it as tensor data.
+// Abstract tensor handles wrapping a CPU-readable region of memory, interpreting it as tensor data.
class ConstCpuTensorHandle : public ITensorHandle
{
public:
@@ -33,6 +35,30 @@ public:
return ITensorHandle::Cpu;
}
+ // No-op for this handle type.
+ virtual void Manage() override {}
+
+ // This handle type never reports a parent: always returns nullptr.
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+ // Returns the wrapped memory pointer directly; the blocking hint is ignored.
+ virtual const void* Map(bool /* blocking = true */) const override { return m_Memory; }
+ // No-op counterpart of Map().
+ virtual void Unmap() const override {}
+
+ // Computes one stride per dimension of m_TensorInfo's shape.
+ // The stride of the last dimension is the element size reported by GetDataTypeSize(); the stride
+ // of every earlier dimension is the stride of the following dimension multiplied by that
+ // following dimension's extent.
+ // \return a TensorShape holding the strides, or the (empty) shape itself when it has no dimensions.
+ TensorShape GetStrides() const override
+ {
+ TensorShape shape(m_TensorInfo.GetShape());
+ const unsigned int numDimensions = shape.GetNumDimensions();
+ if (numDimensions == 0)
+ {
+ // Nothing to compute. Without this guard the unsigned "numDimensions - 1" below would wrap
+ // around and the writes into the empty strides vector would be out of bounds.
+ return shape;
+ }
+
+ std::vector<unsigned int> strides(numDimensions);
+ auto runningSize = GetDataTypeSize(m_TensorInfo.GetDataType());
+ // Walk from the last dimension towards the first, accumulating the extent covered so far.
+ for (unsigned int dim = numDimensions - 1; dim > 0; --dim)
+ {
+ strides[dim] = runningSize;
+ runningSize *= shape[dim];
+ }
+ strides[0] = runningSize;
+ return TensorShape(numDimensions, strides.data());
+ }
+ // Returns the shape stored in m_TensorInfo.
+ TensorShape GetShape() const override { return m_TensorInfo.GetShape(); }
+
protected:
ConstCpuTensorHandle(const TensorInfo& tensorInfo);
@@ -46,7 +72,7 @@ private:
const void* m_Memory;
};
-// Abstract specialization of ConstCpuTensorHandle that allows write access to the same data
+// Abstract specialization of ConstCpuTensorHandle that allows write access to the same data.
class CpuTensorHandle : public ConstCpuTensorHandle
{
public:
@@ -79,9 +105,12 @@ class ScopedCpuTensorHandle : public CpuTensorHandle
public:
explicit ScopedCpuTensorHandle(const TensorInfo& tensorInfo);
- // Copies contents from Tensor
+ // Copies contents from Tensor.
explicit ScopedCpuTensorHandle(const ConstTensor& tensor);
+ // Copies contents from ConstCpuTensorHandle.
+ explicit ScopedCpuTensorHandle(const ConstCpuTensorHandle& tensorHandle);
+
ScopedCpuTensorHandle(const ScopedCpuTensorHandle& other);
ScopedCpuTensorHandle& operator=(const ScopedCpuTensorHandle& other);
~ScopedCpuTensorHandle();
@@ -98,7 +127,7 @@ private:
// Clients must make sure the passed in memory region stays alive for the lifetime of
// the PassthroughCpuTensorHandle instance.
//
-// Note there is no polymorphism to/from ConstPassthroughCpuTensorHandle
+// Note there is no polymorphism to/from ConstPassthroughCpuTensorHandle.
class PassthroughCpuTensorHandle : public CpuTensorHandle
{
public:
@@ -117,7 +146,7 @@ public:
// Clients must make sure the passed in memory region stays alive for the lifetime of
// the PassthroughCpuTensorHandle instance.
//
-// Note there is no polymorphism to/from PassthroughCpuTensorHandle
+// Note there is no polymorphism to/from PassthroughCpuTensorHandle.
class ConstPassthroughCpuTensorHandle : public ConstCpuTensorHandle
{
public:
@@ -131,7 +160,7 @@ public:
};
-// template specializations
+// Template specializations.
template <>
const void* ConstCpuTensorHandle::GetConstTensor() const;
diff --git a/src/armnn/backends/ITensorHandle.hpp b/src/armnn/backends/ITensorHandle.hpp
index b95dcc65e0..ab571ab305 100644
--- a/src/armnn/backends/ITensorHandle.hpp
+++ b/src/armnn/backends/ITensorHandle.hpp
@@ -7,6 +7,8 @@
namespace armnn
{
+class TensorShape;
+
class ITensorHandle
{
public:
@@ -18,8 +20,54 @@ public:
};
virtual ~ITensorHandle(){}
+
+ /// Indicate to the memory manager that this resource is active.
+ /// This is used to compute overlapping lifetimes of resources.
+ virtual void Manage() = 0;
+
+ /// Indicate to the memory manager that this resource is no longer active.
+ /// This is used to compute overlapping lifetimes of resources.
virtual void Allocate() = 0;
+
+ /// Get the backend type associated with the tensor handle.
+ /// \return Type enum
virtual ITensorHandle::Type GetType() const = 0;
+
+ /// Get the parent tensor if this is a subtensor.
+ /// \return a pointer to the parent tensor. Otherwise nullptr if not a subtensor.
+ virtual ITensorHandle* GetParent() const = 0;
+
+ /// Map the tensor data for access.
+ /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent)
+ /// \return pointer to the first element of the mapped data.
+ virtual const void* Map(bool blocking=true) const = 0;
+
+ /// Unmap the tensor data.
+ virtual void Unmap() const = 0;
+
+ /// Map the tensor data for access. Must be paired with a call to Unmap().
+ /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent)
+ /// \return pointer to the first element of the mapped data.
+ void* Map(bool blocking=true)
+ {
+ return const_cast<void*>(static_cast<const ITensorHandle*>(this)->Map(blocking));
+ }
+
+ /// Unmap the tensor data that was previously mapped with a call to Map().
+ void Unmap()
+ {
+ return static_cast<const ITensorHandle*>(this)->Unmap();
+ }
+
+ /// Get the strides for each dimension ordered from largest to smallest where
+ /// the smallest value is the same as the size of a single element in the tensor.
+ /// \return a TensorShape filled with the strides for each dimension.
+ virtual TensorShape GetStrides() const = 0;
+
+ /// Get the number of elements for each dimension ordered from slowest iterating dimension
+ /// to fastest iterating dimension.
+ /// \return a TensorShape filled with the number of elements for each dimension.
+ virtual TensorShape GetShape() const = 0;
};
}
diff --git a/src/armnn/backends/MakeWorkloadHelper.hpp b/src/armnn/backends/MakeWorkloadHelper.hpp
index a1f9b0b0eb..64a7f8983b 100644
--- a/src/armnn/backends/MakeWorkloadHelper.hpp
+++ b/src/armnn/backends/MakeWorkloadHelper.hpp
@@ -9,7 +9,7 @@ namespace armnn
namespace
{
-// Make a workload of the specified WorkloadType
+// Make a workload of the specified WorkloadType.
template<typename WorkloadType>
struct MakeWorkloadForType
{
@@ -37,7 +37,8 @@ struct MakeWorkloadForType<NullWorkload>
// Makes a workload for one the specified types based on the data type requirements of the tensorinfo.
// Specify type void as the WorkloadType for unsupported DataType/WorkloadType combos.
-template <typename Float32Workload, typename Uint8Workload, typename QueueDescriptorType, typename... Args>
+template <typename Float16Workload, typename Float32Workload, typename Uint8Workload, typename QueueDescriptorType,
+ typename... Args>
std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info, Args&&... args)
{
const DataType dataType = !info.m_InputTensorInfos.empty() ?
@@ -49,6 +50,8 @@ std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, c
switch (dataType)
{
+ case DataType::Float16:
+ return MakeWorkloadForType<Float16Workload>::Func(descriptor, info, std::forward<Args>(args)...);
case DataType::Float32:
return MakeWorkloadForType<Float32Workload>::Func(descriptor, info, std::forward<Args>(args)...);
case DataType::QuantisedAsymm8:
@@ -59,5 +62,17 @@ std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, c
}
}
+// Makes a workload for one of the specified types based on the data type requirements of the tensorinfo.
+// Calling this method is equivalent to calling the three-typed MakeWorkload method with <FloatWorkload,
+// FloatWorkload, Uint8Workload>.
+// Specify type void as the WorkloadType for unsupported DataType/WorkloadType combos.
+template <typename FloatWorkload, typename Uint8Workload, typename QueueDescriptorType, typename... Args>
+std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info, Args&&... args)
+{
+ return MakeWorkload<FloatWorkload, FloatWorkload, Uint8Workload>(descriptor, info,
+ std::forward<Args>(args)...);
+}
+
+
} //namespace
} //namespace armnn
diff --git a/src/armnn/backends/MemCopyWorkload.cpp b/src/armnn/backends/MemCopyWorkload.cpp
index 09ffd9a08a..27e60f93b7 100644
--- a/src/armnn/backends/MemCopyWorkload.cpp
+++ b/src/armnn/backends/MemCopyWorkload.cpp
@@ -4,14 +4,7 @@
//
#include "MemCopyWorkload.hpp"
#include "backends/CpuTensorHandle.hpp"
-
-#if ARMCOMPUTECL_ENABLED
-#include "backends/ClTensorHandle.hpp"
-#endif
-
-#if ARMCOMPUTENEON_ENABLED
-#include "backends/NeonTensorHandle.hpp"
-#endif
+#include "TypeUtils.hpp"
#include <cstring>
#include <boost/cast.hpp>
@@ -26,7 +19,7 @@ template <typename SrcTensorHandleType, typename DstTensorHandleType>
void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor,
std::vector<std::pair<SrcTensorHandleType*, DstTensorHandleType*>>& tensorHandlePairs)
{
- const unsigned int numInputs = boost::numeric_cast<unsigned int>(descriptor.m_Inputs.size());
+ const unsigned int numInputs = static_cast<unsigned int>(descriptor.m_Inputs.size());
tensorHandlePairs.reserve(numInputs);
for (unsigned int i = 0; i < numInputs; ++i)
@@ -40,217 +33,29 @@ void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor,
}
}
-void CopyFromCpuToCpu(const ConstCpuTensorHandle& srcHandle, CpuTensorHandle& dstHandle)
-{
- const unsigned int numBytes = srcHandle.GetTensorInfo().GetNumBytes();
- const void* const input = srcHandle.GetConstTensor<void>();
- void* const output = dstHandle.GetTensor<void>();
- std::memcpy(output, input, numBytes);
-}
-
-#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED
-
-#include "backends/ArmComputeTensorUtils.hpp"
-
-template <armnn::DataType DataType>
-void CopyFromCpuToAclBackend(const ConstCpuTensorHandle& srcHandle, arm_compute::ITensor& dstAclTensor)
-{
- using T = ResolveType<DataType>;
- armnn::armcomputetensorutils::CopyArmComputeITensorData(srcHandle.GetConstTensor<T>(), dstAclTensor);
-}
-
-template <armnn::DataType DataType>
-void CopyFromAclBackendToCpu(const arm_compute::ITensor& srcAclTensor, CpuTensorHandle& dstHandle)
-{
- using T = ResolveType<DataType>;
- armnn::armcomputetensorutils::CopyArmComputeITensorData(srcAclTensor, dstHandle.GetTensor<T>());
-}
-
-#endif // ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED
-
-}
-
-template <armnn::DataType DataType>
-CopyFromCpuToCpuWorkload<DataType>::CopyFromCpuToCpuWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
-{
- GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
-}
-
-template <armnn::DataType DataType>
-void CopyFromCpuToCpuWorkload<DataType>::Execute() const
-{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "CopyFromCpuToCpuWorkload_Execute");
-
- for (const auto& pair : m_TensorHandlePairs)
- {
- CopyFromCpuToCpu(*pair.first, *pair.second);
- }
-}
-
-template class CopyFromCpuToCpuWorkload<DataType::Float32>;
-template class CopyFromCpuToCpuWorkload<DataType::QuantisedAsymm8>;
-
-#if ARMCOMPUTECL_ENABLED
-
-template <armnn::DataType DataType>
-CopyFromCpuToClWorkload<DataType>::CopyFromCpuToClWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
-{
- GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
-}
-
-template <armnn::DataType DataType>
-void CopyFromCpuToClWorkload<DataType>::Execute() const
-{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromCpuToClWorkload_Execute");
-
- for (const auto& pair : m_TensorHandlePairs)
- {
- IClTensorHandle& handle = *pair.second;
-
- handle.Map(true);
- CopyFromCpuToAclBackend<DataType>(*pair.first, handle.GetTensor());
- handle.UnMap();
- }
-}
-
-template class CopyFromCpuToClWorkload<DataType::Float32>;
-template class CopyFromCpuToClWorkload<DataType::QuantisedAsymm8>;
-
-
-template <armnn::DataType DataType>
-CopyFromClToCpuWorkload<DataType>::CopyFromClToCpuWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
-{
- GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
-}
-
-template <armnn::DataType DataType>
-void CopyFromClToCpuWorkload<DataType>::Execute() const
-{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromClToCpuWorkload_Execute");
-
- for (const auto& pair : m_TensorHandlePairs)
- {
- IClTensorHandle& handle = *pair.first;
-
- handle.Map(true);
- CopyFromAclBackendToCpu<DataType>(handle.GetTensor(), *pair.second);
- handle.UnMap();
- }
-}
-
-template class CopyFromClToCpuWorkload<DataType::Float32>;
-template class CopyFromClToCpuWorkload<DataType::QuantisedAsymm8>;
-
-#endif // ARMCOMPUTECL_ENABLED
+} //namespace
-#if ARMCOMPUTENEON_ENABLED
-template <armnn::DataType DataType>
-CopyFromCpuToNeonWorkload<DataType>::CopyFromCpuToNeonWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
+CopyMemGenericWorkload::CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : BaseWorkload<MemCopyQueueDescriptor>(descriptor, info)
{
GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
}
-template <armnn::DataType DataType>
-void CopyFromCpuToNeonWorkload<DataType>::Execute() const
+void CopyMemGenericWorkload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "CopyFromCpuToNeonWorkload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyMemGeneric_Execute");
- for (const auto& pair : m_TensorHandlePairs)
- {
- CopyFromCpuToAclBackend<DataType>(*pair.first, pair.second->GetTensor());
- }
-}
-
-template class CopyFromCpuToNeonWorkload<DataType::Float32>;
-template class CopyFromCpuToNeonWorkload<DataType::QuantisedAsymm8>;
-
-template <armnn::DataType DataType>
-CopyFromNeonToCpuWorkload<DataType>::CopyFromNeonToCpuWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
-{
- GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
-}
-
-template <armnn::DataType DataType>
-void CopyFromNeonToCpuWorkload<DataType>::Execute() const
-{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "CopyFromNeonToCpuWorkload_Execute");
+ auto copyFunc = [](void* dst, const void* src, size_t size)
+ {
+ memcpy(dst, src, size);
+ };
for (const auto& pair : m_TensorHandlePairs)
{
- CopyFromAclBackendToCpu<DataType>(pair.first->GetTensor(), *pair.second);
+ CopyTensorContentsGeneric(pair.first, pair.second, copyFunc);
}
}
-template class CopyFromNeonToCpuWorkload<DataType::Float32>;
-template class CopyFromNeonToCpuWorkload<DataType::QuantisedAsymm8>;
-
-#endif // ARMCOMPUTENEON_ENABLED
-
-#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED
-
-template <armnn::DataType DataType>
-CopyFromNeonToClWorkload<DataType>::CopyFromNeonToClWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
-{
- GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
-}
-
-template <armnn::DataType DataType>
-void CopyFromNeonToClWorkload<DataType>::Execute() const
-{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromNeonToClWorkload_Execute");
-
- for (const auto& pair : m_TensorHandlePairs)
- {
- IClTensorHandle& handle = *pair.second;
-
- handle.Map(true);
- handle.GetTensor().copy_from(pair.first->GetTensor());
- handle.UnMap();
- }
-}
-
-template class CopyFromNeonToClWorkload<DataType::Float32>;
-template class CopyFromNeonToClWorkload<DataType::QuantisedAsymm8>;
-
-template <armnn::DataType DataType>
-CopyFromClToNeonWorkload<DataType>::CopyFromClToNeonWorkload(const MemCopyQueueDescriptor& descriptor,
- const WorkloadInfo& info)
- : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info)
-{
- GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
-}
-
-template <armnn::DataType DataType>
-void CopyFromClToNeonWorkload<DataType>::Execute() const
-{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromClToNeonWorkload_Execute");
-
- for (const auto& pair : m_TensorHandlePairs)
- {
- IClTensorHandle& handle = *pair.first;
-
- handle.Map(true);
- pair.second->GetTensor().copy_from(handle.GetTensor());
- handle.UnMap();
- }
-}
-
-template class CopyFromClToNeonWorkload<DataType::Float32>;
-template class CopyFromClToNeonWorkload<DataType::QuantisedAsymm8>;
-
-#endif // ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED
-
-}
+} //namespace armnn
diff --git a/src/armnn/backends/MemCopyWorkload.hpp b/src/armnn/backends/MemCopyWorkload.hpp
index 7fcaf138c3..7a46e5b2ef 100644
--- a/src/armnn/backends/MemCopyWorkload.hpp
+++ b/src/armnn/backends/MemCopyWorkload.hpp
@@ -6,131 +6,21 @@
#include "CpuTensorHandleFwd.hpp"
#include "backends/Workload.hpp"
-
+#include "WorkloadUtils.hpp"
#include <utility>
namespace armnn
{
-template <armnn::DataType DataType>
-class CopyFromCpuToCpuWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
-{
-public:
- CopyFromCpuToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
- void Execute() const override;
-
-private:
- using TensorHandlePair = std::pair<const ConstCpuTensorHandle*, CpuTensorHandle*>;
- std::vector<TensorHandlePair> m_TensorHandlePairs;
-};
-
-using CopyFromCpuToCpuFloat32Workload = CopyFromCpuToCpuWorkload<DataType::Float32>;
-using CopyFromCpuToCpuUint8Workload = CopyFromCpuToCpuWorkload<DataType::QuantisedAsymm8>;
-
-#if ARMCOMPUTECL_ENABLED
-
-class IClTensorHandle;
-
-template <armnn::DataType DataType>
-class CopyFromCpuToClWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
-{
-public:
- CopyFromCpuToClWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
- void Execute() const override;
-
-private:
- using TensorHandlePair = std::pair<const ConstCpuTensorHandle*, IClTensorHandle*>;
- std::vector<TensorHandlePair> m_TensorHandlePairs;
-};
-
-using CopyFromCpuToClFloat32Workload = CopyFromCpuToClWorkload<DataType::Float32>;
-using CopyFromCpuToClUint8Workload = CopyFromCpuToClWorkload<DataType::QuantisedAsymm8>;
-
-template <armnn::DataType DataType>
-class CopyFromClToCpuWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
-{
-public:
- CopyFromClToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
- void Execute() const override;
-
-private:
- using TensorHandlePair = std::pair<IClTensorHandle*, CpuTensorHandle*>;
- std::vector<TensorHandlePair> m_TensorHandlePairs;
-};
-
-using CopyFromClToCpuFloat32Workload = CopyFromClToCpuWorkload<DataType::Float32>;
-using CopyFromClToCpuUint8Workload = CopyFromClToCpuWorkload<DataType::QuantisedAsymm8>;
-
-#endif // ARMCOMPUTECL_ENABLED
-
-#if ARMCOMPUTENEON_ENABLED
-
-class INeonTensorHandle;
-
-template <armnn::DataType DataType>
-class CopyFromCpuToNeonWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
-{
-public:
- CopyFromCpuToNeonWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
- void Execute() const override;
-
-protected:
- using TensorHandlePair = std::pair<const ConstCpuTensorHandle*, INeonTensorHandle*>;
- std::vector<TensorHandlePair> m_TensorHandlePairs;
-};
-
-using CopyFromCpuToNeonFloat32Workload = CopyFromCpuToNeonWorkload<DataType::Float32>;
-using CopyFromCpuToNeonUint8Workload = CopyFromCpuToNeonWorkload<DataType::QuantisedAsymm8>;
-
-template <armnn::DataType DataType>
-class CopyFromNeonToCpuWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
+class CopyMemGenericWorkload : public BaseWorkload<MemCopyQueueDescriptor>
{
public:
- CopyFromNeonToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
- void Execute() const override;
-
-protected:
- using TensorHandlePair = std::pair<const INeonTensorHandle*, CpuTensorHandle*>;
- std::vector<TensorHandlePair> m_TensorHandlePairs;
-};
-
-using CopyFromNeonToCpuFloat32Workload = CopyFromNeonToCpuWorkload<DataType::Float32>;
-using CopyFromNeonToCpuUint8Workload = CopyFromNeonToCpuWorkload<DataType::QuantisedAsymm8>;
-
-#endif
-
-#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED
-
-template <armnn::DataType DataType>
-class CopyFromNeonToClWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
-{
-public:
- CopyFromNeonToClWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
+ CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
void Execute() const override;
private:
- using TensorHandlePair = std::pair<const INeonTensorHandle*, IClTensorHandle*>;
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
std::vector<TensorHandlePair> m_TensorHandlePairs;
};
-using CopyFromNeonToClFloat32Workload = CopyFromNeonToClWorkload<DataType::Float32>;
-using CopyFromNeonToClUint8Workload = CopyFromNeonToClWorkload<DataType::QuantisedAsymm8>;
-
-template <armnn::DataType DataType>
-class CopyFromClToNeonWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType>
-{
-public:
- CopyFromClToNeonWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
- void Execute() const override;
-
-private:
- using TensorHandlePair = std::pair<IClTensorHandle*, INeonTensorHandle*>;
- std::vector<TensorHandlePair> m_TensorHandlePairs;
-};
-
-using CopyFromClToNeonFloat32Workload = CopyFromClToNeonWorkload<DataType::Float32>;
-using CopyFromClToNeonUint8Workload = CopyFromClToNeonWorkload<DataType::QuantisedAsymm8>;
-
-#endif
-
-}
+} //namespace armnn
diff --git a/src/armnn/backends/NeonLayerSupport.cpp b/src/armnn/backends/NeonLayerSupport.cpp
index bfc84bd086..3aef4e60aa 100644
--- a/src/armnn/backends/NeonLayerSupport.cpp
+++ b/src/armnn/backends/NeonLayerSupport.cpp
@@ -15,34 +15,29 @@
#include <boost/core/ignore_unused.hpp>
#ifdef ARMCOMPUTENEON_ENABLED
+#include "NeonWorkloads/NeonAdditionFloat32Workload.hpp"
+#include "NeonWorkloads/NeonActivationFloat32Workload.hpp"
+#include "NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp"
#include "NeonWorkloads/NeonConvolution2dBaseWorkload.hpp"
-#include "NeonWorkloads/NeonPooling2dBaseWorkload.hpp"
+#include "NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp"
+#include "NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp"
+#include "NeonWorkloads/NeonMultiplicationFloat32Workload.hpp"
+#include "NeonWorkloads/NeonNormalizationFloat32Workload.hpp"
+#include "NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp"
#include "NeonWorkloads/NeonPermuteWorkload.hpp"
+#include "NeonWorkloads/NeonPooling2dBaseWorkload.hpp"
+#include "NeonWorkloads/NeonSoftmaxBaseWorkload.hpp"
#endif
using namespace boost;
namespace armnn
{
-bool IsNeonActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters)
-{
- if (parameters.m_Function != ActivationFunction::BoundedReLu)
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "Unsupported activation function, only BoundedReLu is supported)";
- }
-
- return false;
- }
-
- return true;
-}
bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc)
{
// See arm_compute::NEDirectConvolutionLayer documentation for the supported cases,
- // and complement with NEDirectConvolutionLayerKernel::configure() implementation
+ // and complement with NEDirectConvolutionLayerKernel::configure() implementation.
// Only 1x1 is using direct convolution. Performance results and details are in:
// https://jira.arm.com/browse/IVGCVSW-1003
@@ -60,15 +55,15 @@ bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convol
conv2ddesc.m_PadTop > value || conv2ddesc.m_PadBottom > value;
};
- // Supported sizes and padding
+ // Supported sizes and padding.
const bool sizeAndPaddingSupported =
- // Pad > 0 not supported for 1x1 weights
+ // Pad > 0 not supported for 1x1 weights.
(weightInfo.GetShape()[2] == 1 && weightInfo.GetShape()[3] == 1 && !paddingLargerThan(desc, 0u));
const bool preferDirectConvolution = dataTypeSupported &&
strideSupported &&
sizeAndPaddingSupported &&
- // NEDirectConvolutionLayerKernel doesn't support NULL bias
+ // NEDirectConvolutionLayerKernel doesn't support NULL bias.
desc.m_BiasEnabled;
return preferDirectConvolution;
}
@@ -108,10 +103,10 @@ bool IsNeonBackendSupported(std::string* reasonIfUnsupported)
#endif
}
-template<typename Float32Func, typename Uint8Func, typename ... Params>
+template<typename FloatFunc, typename Uint8Func, typename ... Params>
bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported,
DataType dataType,
- Float32Func floatFuncPtr,
+ FloatFunc floatFuncPtr,
Uint8Func uint8FuncPtr,
Params&&... params)
{
@@ -119,6 +114,7 @@ bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported,
IsSupportedForDataTypeGeneric(reasonIfUnsupported,
dataType,
floatFuncPtr,
+ floatFuncPtr,
uint8FuncPtr,
std::forward<Params>(params)...);
}
@@ -144,43 +140,16 @@ inline bool IsWorkloadSupported(FuncType& func, std::string* reasonIfUnsupported
#endif
bool IsActivationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
const ActivationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
ignore_unused(descriptor);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<const ActivationDescriptor&>,
- &IsNeonActivationUint8Supported,
- descriptor);
-}
-
-bool IsNeonDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported,
- const DepthwiseConvolution2dDescriptor& parameters,
- const TensorInfo& weights)
-{
- ignore_unused(weights);
-
- if (parameters.m_StrideX < 1 || parameters.m_StrideX > 3)
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "m_StrideX can only be 1, 2 or 3";
- }
- return false;
- }
-
- // weights.GetShape()[0] = channel multiplier
- if (weights.GetShape()[0] != 1)
- {
- if (reasonIfUnsupported)
- {
- *reasonIfUnsupported = "Channel multiplier only supports the value 1 in the NEON backend";
- }
- return false;
- }
-
- return true;
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonActivationWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor);
}
bool IsAdditionSupportedNeon(const TensorInfo& input0,
@@ -188,23 +157,31 @@ bool IsAdditionSupportedNeon(const TensorInfo& input0,
const TensorInfo& output,
std::string* reasonIfUnsupported)
{
- ignore_unused(input1);
- ignore_unused(output);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input0.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonAdditionWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
}
bool IsBatchNormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
const BatchNormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- ignore_unused(descriptor);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchNormalizationValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ mean,
+ var,
+ beta,
+ gamma,
+ descriptor);
}
bool IsConstantSupportedNeon(const TensorInfo& output,
@@ -233,27 +210,40 @@ bool IsConvolution2dSupportedNeon(const TensorInfo& input,
}
bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
const DepthwiseConvolution2dDescriptor& descriptor,
const TensorInfo& weights,
+ const TensorInfo& biases,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &IsNeonDepthwiseConvolution2dDescParamsSupported,
- &IsNeonDepthwiseConvolution2dDescParamsSupported,
- descriptor,
- weights);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonDepthwiseConvolutionWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor,
+ weights,
+ biases);
}
bool IsFullyConnectedSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
const FullyConnectedDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- ignore_unused(descriptor);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ // At the moment U8 is unsupported.
+ if (input.GetDataType() == DataType::QuantisedAsymm8)
+ {
+ return false;
+ }
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFullyConnectedWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ weights,
+ biases,
+ descriptor);
}
bool IsInputSupportedNeon(const TensorInfo& input,
@@ -266,12 +256,10 @@ bool IsInputSupportedNeon(const TensorInfo& input,
}
bool IsL2NormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFunc<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output);
}
bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs,
@@ -287,13 +275,14 @@ bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs,
bool IsMultiplicationSupportedNeon(const TensorInfo& input0,
const TensorInfo& input1,
+ const TensorInfo& output,
std::string* reasonIfUnsupported)
{
- ignore_unused(input1);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input0.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonMultiplicationWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
}
bool IsNormalizationSupportedNeon(const TensorInfo& input,
@@ -301,11 +290,7 @@ bool IsNormalizationSupportedNeon(const TensorInfo& input,
const NormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &IsNeonNormalizationDescParamsSupported,
- &FalseFuncU8<const NormalizationDescriptor&>,
- descriptor);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonNormalizationWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
}
bool IsOutputSupportedNeon(const TensorInfo& output,
@@ -341,14 +326,11 @@ bool IsResizeBilinearSupportedNeon(const TensorInfo& input,
}
bool IsSoftmaxSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
const SoftmaxDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
- ignore_unused(descriptor);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &TrueFunc<>);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonSoftmaxWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
}
bool IsSplitterSupportedNeon(const TensorInfo& input,
@@ -385,10 +367,72 @@ bool IsFloorSupportedNeon(const TensorInfo& input,
std::string* reasonIfUnsupported)
{
ignore_unused(output);
- return IsSupportedForDataTypeNeon(reasonIfUnsupported,
- input.GetDataType(),
- &TrueFunc<>,
- &FalseFuncU8<>);
+ return IsNeonBackendSupported(reasonIfUnsupported) &&
+ IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+ input.GetDataType(),
+ &FalseFuncF16<>,
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(outputStateIn);
+ ignore_unused(cellStateIn);
+ ignore_unused(scratchBuffer);
+ ignore_unused(outputStateOut);
+ ignore_unused(cellStateOut);
+ ignore_unused(output);
+ ignore_unused(descriptor);
+ ignore_unused(inputToForgetWeights);
+ ignore_unused(inputToCellWeights);
+ ignore_unused(inputToOutputWeights);
+ ignore_unused(recurrentToForgetWeights);
+ ignore_unused(recurrentToCellWeights);
+ ignore_unused(recurrentToOutputWeights);
+ ignore_unused(forgetGateBias);
+ ignore_unused(cellBias);
+ ignore_unused(outputGateBias);
+ ignore_unused(inputToInputWeights);
+ ignore_unused(recurrentToInputWeights);
+ ignore_unused(cellToInputWeights);
+ ignore_unused(inputGateBias);
+ ignore_unused(projectionWeights);
+ ignore_unused(projectionBias);
+ ignore_unused(cellToForgetWeights);
+ ignore_unused(cellToOutputWeights);
+ return false;
+}
+
+bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(output);
+ return true;
+}
+
+bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(output);
+ return true;
}
}
diff --git a/src/armnn/backends/NeonLayerSupport.hpp b/src/armnn/backends/NeonLayerSupport.hpp
index ce2ecec459..6f9fe9c20e 100644
--- a/src/armnn/backends/NeonLayerSupport.hpp
+++ b/src/armnn/backends/NeonLayerSupport.hpp
@@ -11,14 +11,13 @@
namespace armnn
{
-bool IsNeonActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters);
-
bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc);
bool IsNeonNormalizationDescParamsSupported(std::string* reasonIfUnsupported,
const NormalizationDescriptor& parameters);
bool IsActivationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
const ActivationDescriptor& descriptor,
std::string* reasonIfUnsupported);
@@ -32,6 +31,11 @@ bool IsAdditionSupportedNeon(const TensorInfo& input0,
std::string* reasonIfUnsupported);
bool IsBatchNormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
const BatchNormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -45,12 +49,18 @@ bool IsConvolution2dSupportedNeon(const TensorInfo& input,
const TensorInfo& biases,
std::string* reasonIfUnsupported = nullptr);
+
bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
const DepthwiseConvolution2dDescriptor& descriptor,
const TensorInfo& weights,
+ const TensorInfo& biases,
std::string* reasonIfUnsupported = nullptr);
bool IsFullyConnectedSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
const FullyConnectedDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -58,6 +68,7 @@ bool IsInputSupportedNeon(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsL2NormalizationSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs,
@@ -66,6 +77,7 @@ bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs,
bool IsMultiplicationSupportedNeon(const TensorInfo& input0,
const TensorInfo& input1,
+ const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
bool IsNormalizationSupportedNeon(const TensorInfo& input,
@@ -90,6 +102,7 @@ bool IsResizeBilinearSupportedNeon(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsSoftmaxSupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
const SoftmaxDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -108,4 +121,26 @@ bool IsFloorSupportedNeon(const TensorInfo& input,
const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
+bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
}
diff --git a/src/armnn/backends/NeonTensorHandle.hpp b/src/armnn/backends/NeonTensorHandle.hpp
index 684a5e1bfc..3818d2c9b2 100644
--- a/src/armnn/backends/NeonTensorHandle.hpp
+++ b/src/armnn/backends/NeonTensorHandle.hpp
@@ -7,11 +7,14 @@
#include "OutputHandler.hpp"
#include "ArmComputeTensorUtils.hpp"
+#include <arm_compute/runtime/MemoryGroup.h>
+#include <arm_compute/runtime/IMemoryGroup.h>
#include <arm_compute/runtime/Tensor.h>
#include <arm_compute/runtime/SubTensor.h>
#include <arm_compute/core/TensorShape.h>
#include <arm_compute/core/Coordinates.h>
+#include <boost/polymorphic_pointer_cast.hpp>
namespace armnn
{
@@ -22,6 +25,7 @@ public:
virtual arm_compute::ITensor& GetTensor() = 0;
virtual arm_compute::ITensor const& GetTensor() const = 0;
virtual arm_compute::DataType GetDataType() const = 0;
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) = 0;
};
class NeonTensorHandle : public INeonTensorHandle
@@ -34,47 +38,100 @@ public:
arm_compute::ITensor& GetTensor() override { return m_Tensor; }
arm_compute::ITensor const& GetTensor() const override { return m_Tensor; }
+
virtual void Allocate() override
{
armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);
};
+ virtual void Manage() override
+ {
+ BOOST_ASSERT(m_MemoryGroup != nullptr);
+ m_MemoryGroup->manage(&m_Tensor);
+ }
+
virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; }
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
virtual arm_compute::DataType GetDataType() const override
{
return m_Tensor.info()->data_type();
}
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override
+ {
+ m_MemoryGroup = boost::polymorphic_pointer_downcast<arm_compute::MemoryGroup>(memoryGroup);
+ }
+
+ virtual const void* Map(bool /* blocking = true */) const override
+ {
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override {}
+
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
private:
arm_compute::Tensor m_Tensor;
+ std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup;
};
+// Tensor handle wrapping an arm_compute::SubTensor that is constructed over the tensor of a parent
+// INeonTensorHandle, using the given shape and coordinates. Allocate(), Manage() and SetMemoryGroup()
+// are intentionally empty for this handle type (presumably the storage belongs to the parent - confirm).
class NeonSubTensorHandle : public INeonTensorHandle
{
public:
- NeonSubTensorHandle(arm_compute::ITensor& parent,
- const arm_compute::TensorShape& shape,
- const arm_compute::Coordinates& coords)
- : m_Tensor(&parent, shape, coords)
+ // parent is dereferenced to build the sub-tensor and is kept so GetParent() can return it.
+ NeonSubTensorHandle(INeonTensorHandle* parent,
+ const arm_compute::TensorShape& shape,
+ const arm_compute::Coordinates& coords)
+ : m_Tensor(&parent->GetTensor(), shape, coords)
+ , m_ParentHandle(parent)
{
}
arm_compute::ITensor& GetTensor() override { return m_Tensor; }
arm_compute::ITensor const& GetTensor() const override { return m_Tensor; }
- virtual void Allocate() override
- {
- };
+
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; }
+ // Returns the handle this sub-tensor was created from.
+ virtual ITensorHandle* GetParent() const override { return m_ParentHandle; }
+
virtual arm_compute::DataType GetDataType() const override
{
return m_Tensor.info()->data_type();
}
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ // Returns a pointer to the first element: the tensor buffer advanced by offset_first_element_in_bytes().
+ virtual const void* Map(bool /* blocking = true */) const override
+ {
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
private:
- arm_compute::SubTensor m_Tensor;
+ arm_compute::SubTensor m_Tensor;
+ // Non-owning pointer to the parent handle; named with the m_ prefix like the other members.
+ ITensorHandle* m_ParentHandle = nullptr;
};
}
diff --git a/src/armnn/backends/NeonWorkloadFactory.cpp b/src/armnn/backends/NeonWorkloadFactory.cpp
index a17988de5a..6ea72f77cc 100644
--- a/src/armnn/backends/NeonWorkloadFactory.cpp
+++ b/src/armnn/backends/NeonWorkloadFactory.cpp
@@ -9,10 +9,13 @@
#ifdef ARMCOMPUTENEON_ENABLED
#include "arm_compute/runtime/Allocator.h"
+
#include "MemCopyWorkload.hpp"
#include "NeonTensorHandle.hpp"
#include "NeonWorkloadUtils.hpp"
#include "NeonWorkloads.hpp"
+
+#include "memory/IPoolManager.hpp"
#endif
#include "MakeWorkloadHelper.hpp"
@@ -22,7 +25,8 @@
namespace armnn
{
-bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported)
+bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
{
return IWorkloadFactory::IsLayerSupported(Compute::CpuAcc, layer, dataType, outReasonIfUnsupported);
}
@@ -30,7 +34,7 @@ bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType
#ifdef ARMCOMPUTENEON_ENABLED
NeonWorkloadFactory::NeonWorkloadFactory()
-: m_MemoryManager(std::make_unique<arm_compute::Allocator>())
+ : m_MemoryManager(std::make_unique<arm_compute::Allocator>(), BaseMemoryManager::MemoryAffinity::Offset)
{
}
@@ -46,30 +50,33 @@ std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateSubTensorHandle(ITenso
coords.set_num_dimensions(subTensorShape.GetNumDimensions());
for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++)
{
- // arm compute indexes tensor coords in reverse order
+ // Arm compute indexes tensor coords in reverse order.
unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
coords.set(i, boost::numeric_cast<int>(subTensorOrigin[revertedIndex]));
}
- return std::make_unique<NeonSubTensorHandle>(boost::polymorphic_downcast<INeonTensorHandle*>(&parent)->GetTensor(),
- shape, coords);
+ return std::make_unique<NeonSubTensorHandle>(
+ boost::polymorphic_downcast<INeonTensorHandle*>(&parent), shape, coords);
}
+// Creates a NeonTensorHandle for tensorInfo and passes it the memory manager's inter-layer
+// memory group (via SetMemoryGroup) before returning it.
std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
{
- return std::make_unique<NeonTensorHandle>(tensorInfo);
+ auto tensorHandle = std::make_unique<NeonTensorHandle>(tensorInfo);
+ tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup());
+
+ return tensorHandle;
}
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<CopyFromCpuToNeonFloat32Workload, CopyFromCpuToNeonUint8Workload>(descriptor, info);
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<CopyFromNeonToCpuFloat32Workload, CopyFromNeonToCpuUint8Workload>(descriptor, info);
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
@@ -82,7 +89,7 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueue
const WorkloadInfo& info) const
{
return MakeWorkload<NeonSoftmaxFloat32Workload, NeonSoftmaxUint8Workload>(descriptor, info,
- m_MemoryManager.Get());
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
@@ -100,13 +107,14 @@ std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMerger(const Merger
std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateFullyConnected(
const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const
{
- return MakeWorkload<NeonFullyConnectedFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get());
+ return MakeWorkload<NeonFullyConnectedFloat32Workload, NullWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<NeonPermuteFloat32Workload, NeonPermuteUint8Workload>(descriptor, info);
+ return MakeWorkload<NeonPermuteFloatWorkload, NeonPermuteUint8Workload>(descriptor, info);
}
std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
@@ -119,7 +127,7 @@ std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateConvolution2d(
const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const
{
return MakeWorkload<NeonConvolution2dFloat32Workload, NeonConvolution2dUint8Workload>(descriptor, info,
- m_MemoryManager.Get());
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthwiseConvolution2d(
@@ -132,7 +140,8 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthwiseConvolution2d(
std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateNormalization(
const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
{
- return MakeWorkload<NeonNormalizationFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get());
+ return MakeWorkload<NeonNormalizationFloat32Workload, NullWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
@@ -161,21 +170,7 @@ std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMemCopy(const MemCo
throw InvalidArgumentException("NeonWorkloadFactory: Invalid null input for MemCopy workload");
}
- // Create a workload that will copy tensor data from the inputs, which can have a number of different formats,
- // to Neon tensors.
- switch (descriptor.m_Inputs[0]->GetType())
- {
- case ITensorHandle::Cpu:
- return MakeWorkload<CopyFromCpuToNeonFloat32Workload, CopyFromCpuToNeonUint8Workload>(descriptor, info);
-#if ARMCOMPUTECL_ENABLED
- case ITensorHandle::CL:
- {
- return MakeWorkload<CopyFromClToNeonFloat32Workload, CopyFromClToNeonUint8Workload>(descriptor, info);
- }
-#endif
- default:
- throw InvalidArgumentException("NeonWorkloadFactory: Destination type not supported for MemCopy Workload.");
- }
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateResizeBilinear(
@@ -195,7 +190,8 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFakeQuantization(
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
const WorkloadInfo& info) const
{
- return MakeWorkload<NeonL2NormalizationFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get());
+ return MakeWorkload<NeonL2NormalizationFloat32Workload, NullWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
}
std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
@@ -216,11 +212,41 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDesc
return MakeWorkload<NeonFloorFloat32Workload, NullWorkload>(descriptor, info);
}
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NeonLstmFloat32Workload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<NeonConvertFp32ToFp16Workload>(descriptor, info);
+}
+
void NeonWorkloadFactory::Finalize()
{
m_MemoryManager.Finalize();
}
+void NeonWorkloadFactory::Release()
+{
+ m_MemoryManager.Release();
+}
+
+void NeonWorkloadFactory::Acquire()
+{
+ m_MemoryManager.Acquire();
+}
+
#else // Compiled without ArmCompute libs
NeonWorkloadFactory::NeonWorkloadFactory()
@@ -371,9 +397,35 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDesc
return nullptr;
}
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
void NeonWorkloadFactory::Finalize()
{}
+void NeonWorkloadFactory::Release()
+{}
+
+void NeonWorkloadFactory::Acquire()
+{}
+
#endif
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloadFactory.hpp b/src/armnn/backends/NeonWorkloadFactory.hpp
index 66a69f3baf..83e1f5e75f 100644
--- a/src/armnn/backends/NeonWorkloadFactory.hpp
+++ b/src/armnn/backends/NeonWorkloadFactory.hpp
@@ -4,15 +4,17 @@
//
#pragma once
-#include "AclBaseMemoryManager.hpp"
#include "OutputHandler.hpp"
+#include "memory/BaseMemoryManager.hpp"
+
#include <boost/core/ignore_unused.hpp>
+#include <boost/optional.hpp>
namespace armnn
{
-// Neon workload factory
+// Neon workload factory.
class NeonWorkloadFactory : public IWorkloadFactory
{
public:
@@ -20,7 +22,8 @@ public:
virtual Compute GetCompute() const override { return Compute::CpuAcc; }
- static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported);
+ static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
virtual bool SupportsSubTensors() const override { return true; }
@@ -96,11 +99,25 @@ public:
virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
- void Finalize() override;
+ virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
-private:
+ virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual void Finalize() override;
- mutable AclBaseMemoryManager m_MemoryManager;
+ virtual void Release() override;
+
+ virtual void Acquire() override;
+
+private:
+#ifdef ARMCOMPUTENEON_ENABLED
+ mutable NeonMemoryManager m_MemoryManager;
+#endif
};
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloadUtils.cpp b/src/armnn/backends/NeonWorkloadUtils.cpp
index e807d23d6c..07e5d510eb 100644
--- a/src/armnn/backends/NeonWorkloadUtils.cpp
+++ b/src/armnn/backends/NeonWorkloadUtils.cpp
@@ -20,13 +20,14 @@
#include "NeonLayerSupport.hpp"
#include "../../../include/armnn/Types.hpp"
+#include "Half.hpp"
using namespace armnn::armcomputetensorutils;
namespace armnn
{
-// Allocate a tensor and copy the contents in data to the tensor contents
+// Allocates a tensor and copies the contents in data to the tensor contents.
template<typename T>
void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data)
{
@@ -34,8 +35,26 @@ void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data)
CopyArmComputeITensorData(data, tensor);
}
+template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const Half* data);
template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const float* data);
template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const uint8_t* data);
template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const int32_t* data);
+// Initialises tensor from a constant CPU tensor handle that holds floating point data.
+// The handle's data type picks the element type passed to InitialiseArmComputeTensorData:
+// Float16 data is read as Half and Float32 data as float. Any other data type only hits
+// BOOST_ASSERT_MSG and leaves the tensor untouched. handle must not be null (asserted).
+void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor,
+ const ConstCpuTensorHandle* handle)
+{
+ BOOST_ASSERT(handle);
+ switch(handle->GetTensorInfo().GetDataType())
+ {
+ case DataType::Float16:
+ InitialiseArmComputeTensorData(tensor, handle->GetConstTensor<Half>());
+ break;
+ case DataType::Float32:
+ InitialiseArmComputeTensorData(tensor, handle->GetConstTensor<float>());
+ break;
+ default:
+ BOOST_ASSERT_MSG(false, "Unexpected floating point type.");
+ }
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloadUtils.hpp b/src/armnn/backends/NeonWorkloadUtils.hpp
index ec7688237a..8169f8636a 100644
--- a/src/armnn/backends/NeonWorkloadUtils.hpp
+++ b/src/armnn/backends/NeonWorkloadUtils.hpp
@@ -7,6 +7,7 @@
#include "Workload.hpp"
#include "backends/NeonTensorHandle.hpp"
+#include "NeonTimer.hpp"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Helpers.h"
@@ -22,4 +23,12 @@ class Layer;
template<typename T>
void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data);
+void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor, const ConstCpuTensorHandle* handle);
} //namespace armnn
+
+
+// Profiling helper for Neon workloads: expands to ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS
+// with Compute::CpuAcc, the given event name, and a WallClockTimer plus a NeonTimer.
+#define ARMNN_SCOPED_PROFILING_EVENT_NEON(name) \
+ ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::CpuAcc, \
+ name, \
+ armnn::WallClockTimer(), \
+ armnn::NeonTimer())
diff --git a/src/armnn/backends/NeonWorkloads.hpp b/src/armnn/backends/NeonWorkloads.hpp
index 83a3e9fd9b..9619b4e5c9 100644
--- a/src/armnn/backends/NeonWorkloads.hpp
+++ b/src/armnn/backends/NeonWorkloads.hpp
@@ -13,6 +13,8 @@
#include "backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonConstantFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonConstantUint8Workload.hpp"
+#include "backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp"
+#include "backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp"
#include "backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp"
#include "backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonConvolution2dUint8Workload.hpp"
@@ -21,6 +23,7 @@
#include "backends/NeonWorkloads/NeonFloorFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp"
+#include "backends/NeonWorkloads/NeonLstmFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonMergerFloat32Workload.hpp"
#include "backends/NeonWorkloads/NeonMergerUint8Workload.hpp"
#include "backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp"
diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp
index 39e55d5761..711bfceeaf 100644
--- a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp
@@ -9,9 +9,32 @@
namespace armnn
{
+
+// Reports whether an activation with the given input, output and descriptor is supported on Neon.
+// The armnn tensor infos and descriptor are converted to their ACL equivalents and the result of
+// arm_compute::NEActivationLayer::validate is returned, except for one case handled up front:
+// a LOGISTIC activation on QuantisedAsymm8 input is rejected with a RUNTIME_ERROR status.
+arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ const arm_compute::ActivationLayerInfo activationLayerInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(descriptor);
+
+ // Rejected here explicitly rather than being left to NEActivationLayer::validate.
+ if (input.GetDataType() == DataType::QuantisedAsymm8 &&
+ activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+ "Neon: Logistic Activations unsupported with QAsymm8 data type."};
+ }
+
+ return arm_compute::NEActivationLayer::validate(&aclInput,
+ &aclOutput,
+ activationLayerInfo);
+}
+
NeonActivationFloat32Workload::NeonActivationFloat32Workload(const ActivationQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<ActivationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<ActivationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("NeonActivationFloat32Workload", 1, 1);
@@ -26,7 +49,7 @@ NeonActivationFloat32Workload::NeonActivationFloat32Workload(const ActivationQue
void NeonActivationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonActivationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationFloat32Workload_Execute");
m_ActivationLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp
index 6fa83ea2f6..0d26b3b39f 100644
--- a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp
@@ -9,7 +9,12 @@
namespace armnn
{
-class NeonActivationFloat32Workload : public Float32Workload<ActivationQueueDescriptor>
+
+arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor);
+
+class NeonActivationFloat32Workload : public FloatWorkload<ActivationQueueDescriptor>
{
public:
NeonActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp
index 27c37e9425..f2e42338b2 100644
--- a/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp
@@ -13,15 +13,8 @@ NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDe
const WorkloadInfo& info)
: Uint8Workload<ActivationQueueDescriptor>(descriptor, info)
{
-
- std::string reasonIfUnsupported;
- if (!IsNeonActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters))
- {
- throw InvalidArgumentException(reasonIfUnsupported);
- }
-
- // Only BoundedReLu is supported (see IsNeonActivationUint8Supported)
- arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function);
+ arm_compute::ActivationLayerInfo layerInfo(activation,
m_Data.m_Parameters.m_A,
m_Data.m_Parameters.m_B);
@@ -35,7 +28,7 @@ NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDe
void NeonActivationUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonActivationUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationUint8Workload_Execute");
m_ActivationLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp
index d1fb64093d..f26e42aff9 100644
--- a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp
@@ -4,14 +4,30 @@
//
#include "NeonAdditionFloat32Workload.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
#include "backends/CpuTensorHandle.hpp"
namespace armnn
{
+// Reports whether adding input0 and input1 into output is supported on Neon.
+// Builds ACL tensor infos for the three tensors and returns the status of
+// arm_compute::NEArithmeticAddition::validate, called with the SATURATE convert policy.
+arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::NEArithmeticAddition::validate(&aclInput0,
+ &aclInput1,
+ &aclOutput,
+ arm_compute::ConvertPolicy::SATURATE);
+}
+
+
NeonAdditionFloat32Workload::NeonAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<AdditionQueueDescriptor>(descriptor, info)
+ : FloatWorkload<AdditionQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("NeonAdditionFloat32Workload", 2, 1);
@@ -24,7 +40,7 @@ NeonAdditionFloat32Workload::NeonAdditionFloat32Workload(const AdditionQueueDesc
void NeonAdditionFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonAdditionFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAdditionFloat32Workload_Execute");
m_AddLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp
index 5b75b502a3..dae66bb69d 100644
--- a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp
@@ -9,7 +9,12 @@
namespace armnn
{
-class NeonAdditionFloat32Workload : public Float32Workload<AdditionQueueDescriptor>
+
+arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class NeonAdditionFloat32Workload : public FloatWorkload<AdditionQueueDescriptor>
{
public:
NeonAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp
index 247ebfc5dd..e0ad408424 100644
--- a/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp
@@ -5,23 +5,27 @@
#pragma once
+#include <arm_compute/core/Types.h>
#include <backends/ArmComputeTensorUtils.hpp>
#include <backends/CpuTensorHandle.hpp>
#include <backends/NeonTensorHandle.hpp>
+#include <backends/NeonWorkloadUtils.hpp>
#include <backends/Workload.hpp>
+#include <Half.hpp>
#include <boost/cast.hpp>
+#include "Half.hpp"
namespace armnn
{
-// Base class template providing an implementation of the Constant layer common to all data types
-template <armnn::DataType DataFormat>
-class NeonBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataFormat>
+// Base class template providing an implementation of the Constant layer common to all data types.
+template <armnn::DataType... DataFormats>
+class NeonBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataFormats...>
{
public:
NeonBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
- : TypedWorkload<ConstantQueueDescriptor, DataFormat>(descriptor, info)
+ : TypedWorkload<ConstantQueueDescriptor, DataFormats...>(descriptor, info)
, m_RanOnce(false)
{
}
@@ -41,15 +45,22 @@ public:
BOOST_ASSERT(data.m_LayerOutput != nullptr);
arm_compute::ITensor& output =
boost::polymorphic_downcast<NeonTensorHandle*>(data.m_Outputs[0])->GetTensor();
+ arm_compute::DataType computeDataType =
+ boost::polymorphic_downcast<NeonTensorHandle*>(data.m_Outputs[0])->GetDataType();
- switch (DataFormat)
+ switch (computeDataType)
{
- case DataType::Float32:
+ case arm_compute::DataType::F16:
+ {
+ CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
+ break;
+ }
+ case arm_compute::DataType::F32:
{
CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<float>(), output);
break;
}
- case DataType::QuantisedAsymm8:
+ case arm_compute::DataType::QASYMM8:
{
CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<uint8_t>(), output);
break;
diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp
index 24640c7adb..6a87d62320 100644
--- a/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp
@@ -5,20 +5,21 @@
#pragma once
+#include <backends/NeonWorkloadUtils.hpp>
#include <backends/Workload.hpp>
namespace armnn
{
-// Base class template providing an implementation of the Merger layer common to all data types
-template <armnn::DataType DataType>
-class NeonBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataType>
+// Base class template providing an implementation of the Merger layer common to all data types.
+template <armnn::DataType... DataTypes>
+class NeonBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...>
{
public:
- using TypedWorkload<MergerQueueDescriptor, DataType>::TypedWorkload;
+ using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload;
virtual void Execute() const override
{
- // With subtensors, merger is a no-op
+ // With subtensors, merger is a no-op.
}
};
diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp
index 769905b48b..769291c700 100644
--- a/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp
@@ -6,20 +6,21 @@
#pragma once
#include <backends/Workload.hpp>
+#include <backends/NeonWorkloadUtils.hpp>
namespace armnn
{
-// Base class template providing an implementation of the Splitter layer common to all data types
-template <armnn::DataType DataType>
-class NeonBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataType>
+// Base class template providing an implementation of the Splitter layer common to all data types.
+template <armnn::DataType... DataTypes>
+class NeonBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...>
{
public:
- using TypedWorkload<SplitterQueueDescriptor, DataType>::TypedWorkload;
+ using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload;
virtual void Execute() const override
{
- // With subtensors, splitter is a no-op
+ // With subtensors, splitter is a no-op.
}
};
diff --git a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp
index f107c8137f..ca5c8202cd 100644
--- a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp
@@ -6,40 +6,91 @@
#include "NeonBatchNormalizationFloat32Workload.hpp"
#include "backends/CpuTensorHandle.hpp"
#include "backends/ArmComputeTensorUtils.hpp"
+#include "../../../../include/armnn/ArmNN.hpp"
namespace armnn
{
using namespace armcomputetensorutils;
+
+arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean);
+ const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var);
+ const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta);
+ const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma);
+
+ return arm_compute::NEBatchNormalizationLayer::validate(&aclInputInfo,
+ &aclOutputInfo,
+ &aclMeanInfo,
+ &aclVarInfo,
+ &aclBetaInfo,
+ &aclGammaInfo,
+ descriptor.m_Eps);
+}
+
NeonBatchNormalizationFloat32Workload::NeonBatchNormalizationFloat32Workload(
const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
- : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("NeonBatchNormalizationFloat32Workload", 1, 1);
arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
- BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo());
- BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo());
- BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo());
- BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo());
+ m_Mean = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo());
+
+ m_Variance = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo());
- m_Layer.configure(
- &input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps);
+ m_Gamma = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo());
- InitialiseArmComputeTensorData(m_Mean, m_Data.m_Mean->GetConstTensor<float>());
- InitialiseArmComputeTensorData(m_Variance, m_Data.m_Variance->GetConstTensor<float>());
- InitialiseArmComputeTensorData(m_Gamma, m_Data.m_Gamma->GetConstTensor<float>());
- InitialiseArmComputeTensorData(m_Beta, m_Data.m_Beta->GetConstTensor<float>());
+ m_Beta = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
+
+ m_Layer.configure(&input,
+ &output,
+ m_Mean.get(),
+ m_Variance.get(),
+ m_Beta.get(),
+ m_Gamma.get(),
+ m_Data.m_Parameters.m_Eps);
+
+ InitializeArmComputeTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean);
+ InitializeArmComputeTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance);
+ InitializeArmComputeTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma);
+ InitializeArmComputeTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta);
+
+ // Force Compute Library to perform the necessary copying and reshaping, then
+ // free the input tensors that are no longer needed.
+ m_Layer.prepare();
+ FreeUnusedTensors();
}
void NeonBatchNormalizationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonBatchNormalizationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationFloat32Workload_Execute");
m_Layer.run();
}
+void NeonBatchNormalizationFloat32Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_Mean);
+ FreeTensorIfUnused(m_Variance);
+ FreeTensorIfUnused(m_Gamma);
+ FreeTensorIfUnused(m_Beta);
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp
index 2050d42859..5eb5601f26 100644
--- a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp
@@ -10,7 +10,15 @@
namespace armnn
{
-class NeonBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor>
+arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor);
+
+class NeonBatchNormalizationFloat32Workload : public FloatWorkload<BatchNormalizationQueueDescriptor>
{
public:
NeonBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor,
@@ -20,10 +28,12 @@ public:
private:
mutable arm_compute::NEBatchNormalizationLayer m_Layer;
- arm_compute::Tensor m_Mean;
- arm_compute::Tensor m_Variance;
- arm_compute::Tensor m_Gamma;
- arm_compute::Tensor m_Beta;
+ std::unique_ptr<arm_compute::Tensor> m_Mean;
+ std::unique_ptr<arm_compute::Tensor> m_Variance;
+ std::unique_ptr<arm_compute::Tensor> m_Gamma;
+ std::unique_ptr<arm_compute::Tensor> m_Beta;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp
index 8b203fbf3a..4e5d570a8e 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void NeonConstantFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConstantFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantFloat32Workload_Execute");
NeonBaseConstantWorkload::Execute();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp
index 4ea4dfe127..050954df24 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp
@@ -10,10 +10,10 @@
namespace armnn
{
-class NeonConstantFloat32Workload : public NeonBaseConstantWorkload<DataType::Float32>
+class NeonConstantFloat32Workload : public NeonBaseConstantWorkload<DataType::Float16, DataType::Float32>
{
public:
- using NeonBaseConstantWorkload<DataType::Float32>::NeonBaseConstantWorkload;
+ using NeonBaseConstantWorkload<DataType::Float16, DataType::Float32>::NeonBaseConstantWorkload;
virtual void Execute() const override;
};
diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp
index f6dfaeb7a7..4061605bc1 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void NeonConstantUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConstantUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantUint8Workload_Execute");
NeonBaseConstantWorkload::Execute();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp
new file mode 100644
index 0000000000..84fc051f65
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "NeonConvertFp16ToFp32Workload.hpp"
+#include "Half.hpp"
+#include "FloatingPointConverter.hpp"
+
+#include "backends/WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+NeonConvertFp16ToFp32Workload::NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertFp16ToFp32Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp16ToFp32Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp16ToFp32Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const Half*>(src);
+ auto output = reinterpret_cast<float*>(dst);
+ size_t numElements = size/2; // 2 bytes per fp16
+ armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp
new file mode 100644
index 0000000000..136c0d8a76
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+#include "backends/NeonWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+class NeonConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
+{
+public:
+ NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp
new file mode 100644
index 0000000000..61f30522a8
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "NeonConvertFp32ToFp16Workload.hpp"
+
+#include "Half.hpp"
+#include "FloatingPointConverter.hpp"
+
+#include "Profiling.hpp"
+#include "backends/WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+NeonConvertFp32ToFp16Workload::NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToFp16Workload", 1, 1);
+ GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp32ToFp16Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToFp16Workload_Execute");
+
+ auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+ {
+ auto input = reinterpret_cast<const float*>(src);
+ auto output = reinterpret_cast<Half*>(dst);
+ size_t numElements = size/2; // 2 bytes per fp16
+ armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output);
+ };
+
+ for (const auto& pair : m_TensorHandlePairs)
+ {
+ CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+ }
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp
new file mode 100644
index 0000000000..f48c365c48
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+#include "backends/NeonWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+class NeonConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
+{
+public:
+ NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+ std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp
index 423f02bcb0..e76afb6cf7 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp
@@ -9,6 +9,9 @@
#include "NeonConvolution2dBaseWorkload.hpp"
+#include "armnn/Types.hpp"
+#include "Half.hpp"
+
namespace armnn
{
@@ -41,28 +44,28 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input,
layerInfo);
}
-template<armnn::DataType dataType>
-NeonConvolution2dBaseWorkload<dataType>::NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor,
- const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : TypedWorkload<Convolution2dQueueDescriptor, dataType>(descriptor, info)
+template<armnn::DataType... dataTypes>
+NeonConvolution2dBaseWorkload<dataTypes...>::NeonConvolution2dBaseWorkload(
+ const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>(descriptor, info)
{
using arm_compute::NEDirectConvolutionLayer;
- using namespace armcomputetensorutils;
ValidateData();
- // todo: check tensor shapes match
+ // TODO: check that the tensor shapes match.
arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
- BuildArmComputeTensor(m_KernelTensor, m_Data.m_Weight->GetTensorInfo());
+ m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_KernelTensor, m_Data.m_Weight->GetTensorInfo());
- arm_compute::Tensor* optionalBiasTensor = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBiasTensor = &m_BiasTensor;
+ m_BiasTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
}
arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
@@ -81,8 +84,8 @@ NeonConvolution2dBaseWorkload<dataType>::NeonConvolution2dBaseWorkload(const Con
{
auto directConvolutionLayer = std::make_unique<arm_compute::NEDirectConvolutionLayer>(memoryManager);
directConvolutionLayer->configure(&input,
- &m_KernelTensor,
- optionalBiasTensor,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
m_ConvolutionLayer.reset(directConvolutionLayer.release());
@@ -91,22 +94,50 @@ NeonConvolution2dBaseWorkload<dataType>::NeonConvolution2dBaseWorkload(const Con
{
auto convolutionLayer = std::make_unique<arm_compute::NEConvolutionLayer>(memoryManager);
convolutionLayer->configure(&input,
- &m_KernelTensor,
- optionalBiasTensor,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
m_ConvolutionLayer.reset(convolutionLayer.release());
}
BOOST_ASSERT(m_ConvolutionLayer);
- using Type = ResolveType<dataType>;
+ armnn::DataType dataType = m_Data.m_Weight->GetTensorInfo().GetDataType();
+
+ switch (dataType)
+ {
+ case DataType::Float16:
+ {
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<Half>());
+ break;
+ }
+ case DataType::Float32:
+ {
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<float>());
+ break;
+ }
+ case DataType::QuantisedAsymm8:
+ {
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<uint8_t>());
+ break;
+ }
+ default:
+ {
+ BOOST_ASSERT_MSG(false, "Unknown DataType.");
+ }
+ }
+}
- InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor<Type>());
+template<armnn::DataType... dataTypes>
+void NeonConvolution2dBaseWorkload<dataTypes...>::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
}
-// Generate known implementations for linker
-template class NeonConvolution2dBaseWorkload<DataType::Float32>;
-template class NeonConvolution2dBaseWorkload<DataType::QuantisedAsymm8>;
+// Explicitly instantiates the supported specialisations so that the linker can find them.
+template class NeonConvolution2dBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class NeonConvolution2dBaseWorkload<armnn::DataType::QuantisedAsymm8>;
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp
index d28d50d819..524d2c90b6 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp
@@ -25,11 +25,11 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input,
const TensorInfo& weights,
const TensorInfo& biases);
-template<armnn::DataType dataType>
-class NeonConvolution2dBaseWorkload : public TypedWorkload<Convolution2dQueueDescriptor, dataType>
+template<armnn::DataType... dataTypes>
+class NeonConvolution2dBaseWorkload : public TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>
{
public:
- using TypedWorkload<Convolution2dQueueDescriptor, dataType>::m_Data;
+ using TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>::m_Data;
NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
@@ -38,8 +38,11 @@ public:
protected:
std::unique_ptr<arm_compute::IFunction> m_ConvolutionLayer;
- arm_compute::Tensor m_KernelTensor;
- arm_compute::Tensor m_BiasTensor;
+
+ std::unique_ptr<arm_compute::Tensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp
index f20f2a4ac5..18ec6ca2e7 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp
@@ -18,13 +18,16 @@ NeonConvolution2dFloat32Workload::NeonConvolution2dFloat32Workload(const Convolu
{
if (m_Data.m_Parameters.m_BiasEnabled)
{
- InitialiseArmComputeTensorData(m_BiasTensor, m_Data.m_Bias->template GetConstTensor<float>());
+ InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
}
+
+ m_ConvolutionLayer->prepare();
+ FreeUnusedTensors();
}
void NeonConvolution2dFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConvolution2dFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dFloat32Workload_Execute");
m_ConvolutionLayer->run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp
index 56b0848efa..0bb8d69d94 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp
@@ -15,7 +15,7 @@
namespace armnn
{
-class NeonConvolution2dFloat32Workload : public NeonConvolution2dBaseWorkload<DataType::Float32>
+class NeonConvolution2dFloat32Workload : public NeonConvolution2dBaseWorkload<DataType::Float16, DataType::Float32>
{
public:
NeonConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp
index fb91f7b7b2..bb33e939ea 100644
--- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp
@@ -14,14 +14,16 @@ NeonConvolution2dUint8Workload::NeonConvolution2dUint8Workload(const Convolution
{
if (m_Data.m_Parameters.m_BiasEnabled)
{
- InitialiseArmComputeTensorData(m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>());
+ InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>());
}
-}
+ m_ConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
void NeonConvolution2dUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConvolution2dUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dUint8Workload_Execute");
m_ConvolutionLayer->run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp
new file mode 100644
index 0000000000..58d6061537
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp
@@ -0,0 +1,46 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "NeonDepthwiseConvolutionBaseWorkload.hpp"
+
+#include "backends/ArmComputeTensorUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const TensorInfo& biases)
+{
+ const arm_compute::TensorInfo aclInputInfo =
+ armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo =
+ armcomputetensorutils::BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclWeightsInfo =
+ armcomputetensorutils::BuildArmComputeTensorInfo(weights);
+
+ arm_compute::TensorInfo aclBiasesInfo;
+ arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+ if (descriptor.m_BiasEnabled)
+ {
+ aclBiasesInfo = armcomputetensorutils::BuildArmComputeTensorInfo(biases);
+ optionalAclBiasesInfo = &aclBiasesInfo;
+ }
+
+ const arm_compute::PadStrideInfo aclPadStrideInfo =
+ armcomputetensorutils::BuildArmComputePadStrideInfo(descriptor);
+ const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+ return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo,
+ &aclWeightsInfo,
+ optionalAclBiasesInfo,
+ &aclOutputInfo,
+ aclPadStrideInfo,
+ aclDepthMultiplier);
+}
+
+}
diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp
new file mode 100644
index 0000000000..0cead354f8
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp
@@ -0,0 +1,19 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/NeonWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const TensorInfo& biases);
+
+} // namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp
index 11e31c727a..f94cd903b6 100644
--- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp
@@ -16,23 +16,17 @@ using namespace armcomputetensorutils;
NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload(
const DepthwiseConvolution2dQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+ : FloatWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
{
const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
- std::string reasonIfUnsupported;
- if (!IsNeonDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo))
- {
- throw UnimplementedException(reasonIfUnsupported);
- }
+ m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo);
- BuildArmComputeTensor(m_KernelTensor, weightInfo);
-
- arm_compute::Tensor* optionalBias = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBias = &m_BiasTensor;
+ m_BiasTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
}
arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
@@ -54,8 +48,8 @@ NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload
m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
static_cast<arm_compute::NEDepthwiseConvolutionLayer3x3*>(
m_pDepthwiseConvolutionLayer.get())->configure(&input,
- &m_KernelTensor,
- optionalBias,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
}
@@ -64,28 +58,37 @@ NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload
m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
static_cast<arm_compute::NEDepthwiseConvolutionLayer*>(
m_pDepthwiseConvolutionLayer.get())->configure(&input,
- &m_KernelTensor,
- optionalBias,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
}
BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
- InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<float>());
+ InitializeArmComputeTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight);
- if (optionalBias)
+ if (m_BiasTensor)
{
- InitialiseArmComputeTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<float>());
+ InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias);
}
+
+ m_pDepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
}
void NeonDepthwiseConvolutionFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "NeonDepthwiseConvolutionFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionFloat32Workload_Execute");
BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
m_pDepthwiseConvolutionLayer->run();
}
+void NeonDepthwiseConvolutionFloat32Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp
index f9e295f568..ece9f1877b 100644
--- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class NeonDepthwiseConvolutionFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor>
+class NeonDepthwiseConvolutionFloat32Workload : public FloatWorkload<DepthwiseConvolution2dQueueDescriptor>
{
public:
NeonDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
@@ -20,8 +20,10 @@ public:
private:
mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
- arm_compute::Tensor m_KernelTensor;
- arm_compute::Tensor m_BiasTensor;
+ std::unique_ptr<arm_compute::Tensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp
index bd034c4f80..45fbcb37ab 100644
--- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp
@@ -20,19 +20,13 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload(
{
const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
- std::string reasonIfUnsupported;
- if (!IsNeonDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo))
- {
- throw UnimplementedException(reasonIfUnsupported);
- }
+ m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo);
- BuildArmComputeTensor(m_KernelTensor, weightInfo);
-
- arm_compute::Tensor* optionalBias = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBias = &m_BiasTensor;
+ m_BiasTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
}
arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
@@ -54,8 +48,8 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload(
m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
static_cast<arm_compute::NEDepthwiseConvolutionLayer3x3*>(
m_pDepthwiseConvolutionLayer.get())->configure(&input,
- &m_KernelTensor,
- optionalBias,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
}
@@ -64,28 +58,37 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload(
m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
static_cast<arm_compute::NEDepthwiseConvolutionLayer*>(
m_pDepthwiseConvolutionLayer.get())->configure(&input,
- &m_KernelTensor,
- optionalBias,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
&output,
padStrideInfo);
}
BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
- InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
+ InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>());
- if (optionalBias)
+ if (m_BiasTensor)
{
- InitialiseArmComputeTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<int32_t>());
+ InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor<int32_t>());
}
+
+ m_pDepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
}
void NeonDepthwiseConvolutionUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "NeonDepthwiseConvolutionUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionUint8Workload_Execute");
BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
m_pDepthwiseConvolutionLayer->run();
}
+void NeonDepthwiseConvolutionUint8Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp
index 9cf272e9f5..aca0ba5337 100644
--- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp
@@ -20,8 +20,10 @@ public:
private:
mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer;
- arm_compute::Tensor m_KernelTensor;
- arm_compute::Tensor m_BiasTensor;
+ std::unique_ptr<arm_compute::Tensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp
index a5eec5cadb..c43cfa9c46 100644
--- a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp
@@ -9,7 +9,7 @@ namespace armnn
{
NeonFloorFloat32Workload::NeonFloorFloat32Workload(const FloorQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<FloorQueueDescriptor>(descriptor, info)
+ : FloatWorkload<FloorQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("NeonFloorFloat32Workload", 1, 1);
@@ -21,7 +21,7 @@ NeonFloorFloat32Workload::NeonFloorFloat32Workload(const FloorQueueDescriptor& d
void NeonFloorFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonFloorFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFloorFloat32Workload_Execute");
m_Layer.run();
}
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp
index f876f1e1bb..56680f1e39 100644
--- a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class NeonFloorFloat32Workload : public Float32Workload<FloorQueueDescriptor>
+class NeonFloorFloat32Workload : public FloatWorkload<FloorQueueDescriptor>
{
public:
NeonFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp
index e1c4448642..c3af41e20d 100644
--- a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp
@@ -4,16 +4,47 @@
//
#include "NeonFullyConnectedFloat32Workload.hpp"
-#include "backends/CpuTensorHandle.hpp"
+
#include "backends/ArmComputeTensorUtils.hpp"
+#include "backends/ArmComputeUtils.hpp"
+#include "backends/CpuTensorHandle.hpp"
namespace armnn
{
using namespace armcomputetensorutils;
+arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights);
+
+ arm_compute::TensorInfo aclBiases;
+ arm_compute::TensorInfo *optionalAclBiases = nullptr;
+ if (descriptor.m_BiasEnabled)
+ {
+ aclBiases = BuildArmComputeTensorInfo(biases);
+ optionalAclBiases = &aclBiases;
+ }
+
+ const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo =
+ ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor);
+
+
+ return arm_compute::NEFullyConnectedLayer::validate(&aclInput,
+ &aclWeights,
+ optionalAclBiases,
+ &aclOutput,
+ fullyConnectedLayerInfo);
+}
+
NeonFullyConnectedFloat32Workload::NeonFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor,
const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info)
+ : FloatWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
, m_FullyConnectedLayer(memoryManager)
{
m_Data.ValidateInputsOutputs("NeonFullyConnectedFloat32Workload", 1, 1);
@@ -21,33 +52,45 @@ NeonFullyConnectedFloat32Workload::NeonFullyConnectedFloat32Workload(const Fully
arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
- BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo());
+ m_WeightsTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo());
- arm_compute::Tensor* optionalBiasTensor = nullptr;
if (m_Data.m_Parameters.m_BiasEnabled)
{
- BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo());
- optionalBiasTensor = &m_BiasesTensor;
+ m_BiasesTensor = std::make_unique<arm_compute::Tensor>();
+ BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo());
}
// Construct
- m_FullyConnectedLayer.configure(
- &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix);
+ arm_compute::FullyConnectedLayerInfo fc_info;
+ fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix;
+ m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
// Allocate
- InitialiseArmComputeTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor<float>());
+ InitializeArmComputeTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight);
- if (optionalBiasTensor)
+ if (m_BiasesTensor)
{
- InitialiseArmComputeTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor<float>());
+ InitializeArmComputeTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias);
}
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_FullyConnectedLayer.prepare();
+ FreeUnusedTensors();
}
void NeonFullyConnectedFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonFullyConnectedFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFullyConnectedFloat32Workload_Execute");
m_FullyConnectedLayer.run();
}
+void NeonFullyConnectedFloat32Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_WeightsTensor);
+ FreeTensorIfUnused(m_BiasesTensor);
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp
index 9c722dc573..684b5e0753 100644
--- a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp
@@ -14,7 +14,13 @@
namespace armnn
{
-class NeonFullyConnectedFloat32Workload : public Float32Workload<FullyConnectedQueueDescriptor>
+arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor);
+
+class NeonFullyConnectedFloat32Workload : public FloatWorkload<FullyConnectedQueueDescriptor>
{
public:
NeonFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info,
@@ -23,8 +29,11 @@ public:
private:
mutable arm_compute::NEFullyConnectedLayer m_FullyConnectedLayer;
- arm_compute::Tensor m_WeightsTensor;
- arm_compute::Tensor m_BiasesTensor;
+
+ std::unique_ptr<arm_compute::Tensor> m_WeightsTensor;
+ std::unique_ptr<arm_compute::Tensor> m_BiasesTensor;
+
+ void FreeUnusedTensors();
};
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp
index 9f79fa09de..a3ae33f41f 100644
--- a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp
@@ -9,9 +9,21 @@
namespace armnn
{
+arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ arm_compute::NormalizationLayerInfo normalizationInfo =
+ CreateAclNormalizationLayerInfoForL2Normalization(input);
+
+ return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo);
+}
+
NeonL2NormalizationFloat32Workload::NeonL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor,
const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<L2NormalizationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
, m_Layer(memoryManager)
{
m_Data.ValidateInputsOutputs("NeonL2NormalizationFloat32Workload", 1, 1);
@@ -23,7 +35,7 @@ NeonL2NormalizationFloat32Workload::NeonL2NormalizationFloat32Workload(const L2N
void NeonL2NormalizationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonL2NormalizationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonL2NormalizationFloat32Workload_Execute");
m_Layer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp
index 2b4a1fef37..c3fcde5a57 100644
--- a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp
@@ -14,7 +14,10 @@
namespace armnn
{
-class NeonL2NormalizationFloat32Workload : public Float32Workload<L2NormalizationQueueDescriptor>
+arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output);
+
+class NeonL2NormalizationFloat32Workload : public FloatWorkload<L2NormalizationQueueDescriptor>
{
public:
NeonL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
diff --git a/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp
new file mode 100644
index 0000000000..ba1369e179
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp
@@ -0,0 +1,22 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "NeonLstmFloat32Workload.hpp"
+
+namespace armnn
+{
+NeonLstmFloat32Workload::NeonLstmFloat32Workload(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<LstmQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonLstmFloat32Workload", 1, 1);
+}
+
+void NeonLstmFloat32Workload::Execute() const
+{
+ throw armnn::Exception("No implementation of Lstm in the Neon backend!");
+}
+
+} // namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp
new file mode 100644
index 0000000000..78ee1da341
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include <backends/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonLstmFloat32Workload : public FloatWorkload<LstmQueueDescriptor>
+{
+public:
+ NeonLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp
index 7520e8768e..30dd283620 100644
--- a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void NeonMergerFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClMergerFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerFloat32Workload_Execute");
NeonBaseMergerWorkload::Execute();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp
index 5c889c2af0..7b8ee9881f 100644
--- a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp
@@ -10,10 +10,10 @@
namespace armnn
{
-class NeonMergerFloat32Workload : public NeonBaseMergerWorkload<DataType::Float32>
+class NeonMergerFloat32Workload : public NeonBaseMergerWorkload<DataType::Float16, DataType::Float32>
{
public:
- using NeonBaseMergerWorkload<DataType::Float32>::NeonBaseMergerWorkload;
+ using NeonBaseMergerWorkload<DataType::Float16, DataType::Float32>::NeonBaseMergerWorkload;
virtual void Execute() const override;
};
diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp
index 51578e5bff..caccdd443a 100644
--- a/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void NeonMergerUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClMergerUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerUint8Workload_Execute");
NeonBaseMergerWorkload::Execute();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp
index 58ce7b74ba..a8a3cd77b4 100644
--- a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp
@@ -9,9 +9,28 @@
namespace armnn
{
+arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it,
+ // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be
+ // ignored for F32 tensors.
+ return arm_compute::NEPixelWiseMultiplication::validate(&aclInput1,
+ &aclInput2,
+ &aclOutput,
+ 1.0f,
+ arm_compute::ConvertPolicy::SATURATE,
+ arm_compute::RoundingPolicy::TO_ZERO);
+}
+
NeonMultiplicationFloat32Workload::NeonMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<MultiplicationQueueDescriptor>(descriptor, info)
+ : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("NeonMultiplicationFloat32Workload", 2, 1);
@@ -32,7 +51,7 @@ NeonMultiplicationFloat32Workload::NeonMultiplicationFloat32Workload(const Multi
void NeonMultiplicationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonMultiplicationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMultiplicationFloat32Workload_Execute");
m_PixelWiseMultiplication.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp
index ed5ead3700..62e84a2e07 100644
--- a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp
@@ -9,8 +9,11 @@
namespace armnn
{
+arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
-class NeonMultiplicationFloat32Workload : public Float32Workload<MultiplicationQueueDescriptor>
+class NeonMultiplicationFloat32Workload : public FloatWorkload<MultiplicationQueueDescriptor>
{
public:
NeonMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp
index 0fd0dcc420..20936a2760 100644
--- a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp
@@ -6,13 +6,28 @@
#include "NeonNormalizationFloat32Workload.hpp"
#include "backends/NeonLayerSupport.hpp"
#include "backends/ArmComputeUtils.hpp"
+#include "backends/ArmComputeTensorUtils.hpp"
namespace armnn
{
+arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ arm_compute::NormalizationLayerInfo normalizationInfo =
+ armcomputetensorutils::BuildArmComputeNormalizationLayerInfo(descriptor);
+
+ return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo);
+}
+
NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor,
- const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<NormalizationQueueDescriptor>(descriptor, info)
+ const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
, m_NormalizationLayer(memoryManager)
{
m_Data.ValidateInputsOutputs("NeonNormalizationFloat32Workload", 1, 1);
@@ -22,7 +37,7 @@ NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const Normali
throw UnimplementedException(reasonIfUnsupported);
}
- // input and output tensors have to have the same dimensionality
+ // Input and output tensors have to have the same dimensionality.
if (info.m_InputTensorInfos[0].GetShape()[1] != info.m_OutputTensorInfos[0].GetShape()[1]
|| info.m_InputTensorInfos[0].GetShape()[0] != info.m_OutputTensorInfos[0].GetShape()[0]
|| info.m_InputTensorInfos[0].GetShape()[3] != info.m_OutputTensorInfos[0].GetShape()[3]
@@ -48,7 +63,7 @@ NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const Normali
void NeonNormalizationFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonNormalizationFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNormalizationFloat32Workload_Execute");
m_NormalizationLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp
index 24b6da8528..8f0823454b 100644
--- a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp
@@ -12,7 +12,11 @@
namespace armnn
{
-class NeonNormalizationFloat32Workload : public Float32Workload<NormalizationQueueDescriptor>
+arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor);
+
+class NeonNormalizationFloat32Workload : public FloatWorkload<NormalizationQueueDescriptor>
{
public:
NeonNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
diff --git a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp
index e0a0457422..c27797ee4e 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp
@@ -24,10 +24,10 @@ arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input,
armcomputetensorutils::BuildArmComputePermutationVector(mappings));
}
-template <armnn::DataType DataType>
-NeonPermuteWorkload<DataType>::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor,
+template <armnn::DataType... DataTypes>
+NeonPermuteWorkload<DataTypes...>::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : TypedWorkload<PermuteQueueDescriptor, DataType>(descriptor, info)
+ : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info)
{
using armcomputetensorutils::BuildArmComputePermutationVector;
@@ -37,18 +37,18 @@ NeonPermuteWorkload<DataType>::NeonPermuteWorkload(const PermuteQueueDescriptor&
arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
- // Run the layer
+ // Run the layer.
m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings));
}
-template <armnn::DataType DataType>
-void NeonPermuteWorkload<DataType>::Execute() const
+template <armnn::DataType... DataTypes>
+void NeonPermuteWorkload<DataTypes...>::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, GetName() + "_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute");
m_PermuteFunction.run();
}
-template class NeonPermuteWorkload<DataType::Float32>;
+template class NeonPermuteWorkload<DataType::Float16, DataType::Float32>;
template class NeonPermuteWorkload<DataType::QuantisedAsymm8>;
} // namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp
index 56e8719d6c..06b2dc692b 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp
@@ -7,6 +7,7 @@
#include "backends/Workload.hpp"
#include "backends/WorkloadData.hpp"
+#include "backends/NeonWorkloadUtils.hpp"
#include <armnn/TypesUtils.hpp>
#include <arm_compute/runtime/NEON/functions/NEPermute.h>
@@ -18,13 +19,13 @@ namespace armnn
arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input, const TensorInfo& output,
const PermuteDescriptor& descriptor);
-template <armnn::DataType DataType>
-class NeonPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataType>
+template <armnn::DataType... DataTypes>
+class NeonPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...>
{
public:
static const std::string& GetName()
{
- static const std::string name = std::string("NeonPermute") + GetDataTypeName(DataType) + "Workload";
+ static const std::string name = std::string("NeonPermuteWorkload");
return name;
}
@@ -32,11 +33,11 @@ public:
void Execute() const override;
private:
- using TypedWorkload<PermuteQueueDescriptor, DataType>::m_Data;
+ using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data;
mutable arm_compute::NEPermute m_PermuteFunction;
};
-using NeonPermuteFloat32Workload = NeonPermuteWorkload<DataType::Float32>;
+using NeonPermuteFloatWorkload = NeonPermuteWorkload<DataType::Float16, DataType::Float32>;
using NeonPermuteUint8Workload = NeonPermuteWorkload<DataType::QuantisedAsymm8>;
-} //namespace armnn
+} // namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp
index 6d6a492155..3585d36ba3 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp
@@ -25,10 +25,10 @@ arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input,
return arm_compute::NEPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo);
}
-template <armnn::DataType dataType>
-NeonPooling2dBaseWorkload<dataType>::NeonPooling2dBaseWorkload(
+template <armnn::DataType... dataTypes>
+NeonPooling2dBaseWorkload<dataTypes...>::NeonPooling2dBaseWorkload(
const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name)
- : TypedWorkload<Pooling2dQueueDescriptor, dataType>(descriptor, info)
+ : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info)
{
m_Data.ValidateInputsOutputs(name, 1, 1);
@@ -40,7 +40,7 @@ NeonPooling2dBaseWorkload<dataType>::NeonPooling2dBaseWorkload(
m_PoolingLayer.configure(&input, &output, layerInfo);
}
-template class NeonPooling2dBaseWorkload<DataType::Float32>;
+template class NeonPooling2dBaseWorkload<DataType::Float16, DataType::Float32>;
template class NeonPooling2dBaseWorkload<DataType::QuantisedAsymm8>;
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp
index 9461982f86..2e85e937fa 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp
@@ -14,12 +14,12 @@ arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input,
const TensorInfo& output,
const Pooling2dDescriptor& descriptor);
-// Base class template providing an implementation of the Pooling2d layer common to all data types
-template <armnn::DataType dataType>
-class NeonPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataType>
+// Base class template providing an implementation of the Pooling2d layer common to all data types.
+template <armnn::DataType... dataTypes>
+class NeonPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>
{
public:
- using TypedWorkload<Pooling2dQueueDescriptor, dataType>::m_Data;
+ using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data;
NeonPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info,
const std::string& name);
diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp
index ba2aa20924..cb690c51b8 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp
@@ -12,13 +12,14 @@ namespace armnn
NeonPooling2dFloat32Workload::NeonPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : NeonPooling2dBaseWorkload<armnn::DataType::Float32>(descriptor, info, "NeonPooling2dFloat32Workload")
+ : NeonPooling2dBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>(descriptor, info,
+ "NeonPooling2dFloat32Workload")
{
}
void NeonPooling2dFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonPooling2dFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dFloat32Workload_Execute");
m_PoolingLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp
index 6cfc9cc96f..36c4e7edf1 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp
@@ -11,7 +11,8 @@
namespace armnn
{
-class NeonPooling2dFloat32Workload : public NeonPooling2dBaseWorkload<armnn::DataType::Float32>
+class NeonPooling2dFloat32Workload : public NeonPooling2dBaseWorkload<armnn::DataType::Float16,
+ armnn::DataType::Float32>
{
public:
NeonPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp
index 0778794081..3e06d08dea 100644
--- a/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp
@@ -18,7 +18,7 @@ NeonPooling2dUint8Workload::NeonPooling2dUint8Workload(const Pooling2dQueueDescr
void NeonPooling2dUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonPooling2dUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dUint8Workload_Execute");
m_PoolingLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp
index 317d16f6bd..93f6eb8ef5 100644
--- a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp
@@ -12,7 +12,7 @@ namespace armnn
NeonReshapeFloat32Workload::NeonReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor,
const WorkloadInfo& info)
- : Float32Workload<ReshapeQueueDescriptor>(descriptor, info)
+ : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info)
{
m_Data.ValidateInputsOutputs("NeonReshapeFloat32Workload", 1, 1);
@@ -24,7 +24,7 @@ NeonReshapeFloat32Workload::NeonReshapeFloat32Workload(const ReshapeQueueDescrip
void NeonReshapeFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonReshapeFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeFloat32Workload_Execute");
m_Layer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp
index 27f4aea9e7..3e5cca1b9e 100644
--- a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-class NeonReshapeFloat32Workload : public Float32Workload<ReshapeQueueDescriptor>
+class NeonReshapeFloat32Workload : public FloatWorkload<ReshapeQueueDescriptor>
{
public:
NeonReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp
index 06f57c1e0f..b31bdcd3d0 100644
--- a/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp
@@ -24,7 +24,7 @@ NeonReshapeUint8Workload::NeonReshapeUint8Workload(const ReshapeQueueDescriptor&
void NeonReshapeUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonReshapeUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeUint8Workload_Execute");
m_Layer.run();
}
} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp
new file mode 100644
index 0000000000..3efffafe25
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "NeonSoftmaxBaseWorkload.hpp"
+
+#include "backends/ArmComputeTensorUtils.hpp"
+
+namespace armnn
+{
+
+/// Validates a softmax configuration for the NEON backend.
+/// Inputs with four or more dimensions are rejected with a RUNTIME_ERROR status;
+/// otherwise the verdict comes from arm_compute::NESoftmaxLayer::validate().
+arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input,
+                                                const TensorInfo& output,
+                                                const SoftmaxDescriptor& descriptor)
+{
+    // NOTE: 4D (and higher-rank) Softmax is reported as unsupported until full support is added to ACL.
+    const auto inputRank = input.GetShape().GetNumDimensions();
+    if (inputRank >= 4u)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported");
+    }
+
+    // Build the ACL tensor descriptions that the ACL validate call expects.
+    const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+    return arm_compute::NESoftmaxLayer::validate(&aclInput, &aclOutput, descriptor.m_Beta);
+}
+
+} //namespace armnn
+
diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp
new file mode 100644
index 0000000000..b9b21fb254
--- /dev/null
+++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/NeonWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+/// Validates a softmax configuration and reports the outcome as an arm_compute::Status.
+/// The definition in NeonSoftmaxBaseWorkload.cpp rejects inputs with four or more
+/// dimensions and otherwise defers to arm_compute::NESoftmaxLayer::validate().
+arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor);
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp
index 5e2925ca02..027b508ad5 100644
--- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp
@@ -10,12 +10,12 @@ namespace armnn
NeonSoftmaxFloat32Workload::NeonSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor,
const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
- : Float32Workload<SoftmaxQueueDescriptor>(descriptor, info)
+ : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info)
, m_SoftmaxLayer(memoryManager)
{
m_Data.ValidateInputsOutputs("NeonSoftmaxFloat32Workload", 1, 1);
- // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions
+ // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions.
arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
@@ -24,7 +24,7 @@ NeonSoftmaxFloat32Workload::NeonSoftmaxFloat32Workload(const SoftmaxQueueDescrip
void NeonSoftmaxFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSoftmaxFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxFloat32Workload_Execute");
m_SoftmaxLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp
index 91d25b47f8..3656a26a3c 100644
--- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp
@@ -14,7 +14,7 @@
namespace armnn
{
-class NeonSoftmaxFloat32Workload : public Float32Workload<SoftmaxQueueDescriptor>
+class NeonSoftmaxFloat32Workload : public FloatWorkload<SoftmaxQueueDescriptor>
{
public:
NeonSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp
index eb4a23c13c..4b0c05b25b 100644
--- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp
@@ -32,7 +32,7 @@ NeonSoftmaxUint8Workload::NeonSoftmaxUint8Workload(const SoftmaxQueueDescriptor&
void NeonSoftmaxUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClSoftmaxUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxUint8Workload_Execute");
m_SoftmaxLayer.run();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp
index 13701d2ed3..996fc15adb 100644
--- a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void NeonSplitterFloat32Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSplitterFloat32Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterFloat32Workload_Execute");
NeonBaseSplitterWorkload::Execute();
}
diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp
index 432f5de4eb..9f6dc75499 100644
--- a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp
+++ b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp
@@ -10,10 +10,10 @@
namespace armnn
{
-class NeonSplitterFloat32Workload : public NeonBaseSplitterWorkload<DataType::Float32>
+class NeonSplitterFloat32Workload : public NeonBaseSplitterWorkload<DataType::Float16, DataType::Float32>
{
public:
- using NeonBaseSplitterWorkload<DataType::Float32>::NeonBaseSplitterWorkload;
+ using NeonBaseSplitterWorkload<DataType::Float16, DataType::Float32>::NeonBaseSplitterWorkload;
virtual void Execute() const override;
};
diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp
index 90d24d3ffd..0d6328ff7e 100644
--- a/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp
+++ b/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp
@@ -10,7 +10,7 @@ namespace armnn
void NeonSplitterUint8Workload::Execute() const
{
- ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSplitterUint8Workload_Execute");
+ ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterUint8Workload_Execute");
NeonBaseSplitterWorkload::Execute();
}
diff --git a/src/armnn/backends/OutputHandler.cpp b/src/armnn/backends/OutputHandler.cpp
index 54afe565a9..ccc62c89ce 100644
--- a/src/armnn/backends/OutputHandler.cpp
+++ b/src/armnn/backends/OutputHandler.cpp
@@ -30,12 +30,4 @@ void OutputHandler::CollectWorkloadOutputs(WorkloadDataCollector& dataCollector)
dataCollector.Push(m_TensorHandle.get(), m_TensorInfo);
}
-void OutputHandler::AllocateTensors()
-{
- if (m_TensorHandle)
- {
- m_TensorHandle->Allocate();
- }
-}
-
} // namespace armnn
diff --git a/src/armnn/backends/OutputHandler.hpp b/src/armnn/backends/OutputHandler.hpp
index 9cc87c6095..ed95577cca 100644
--- a/src/armnn/backends/OutputHandler.hpp
+++ b/src/armnn/backends/OutputHandler.hpp
@@ -31,30 +31,27 @@ class WorkloadDataCollector;
class OutputHandler
{
public:
- /// @brief Sets the TensorInfo used by this output handler.
- /// @param tensorInfo TensorInfo for the output.
+ /// @brief - Sets the TensorInfo used by this output handler.
+ /// @param tensorInfo - TensorInfo for the output.
void SetTensorInfo(const TensorInfo& tensorInfo);
- /// @brief Create tensor handlers used by the intermediate tensors. Does not allocate memory.
- /// @param factory Factory to be used for handler creation.
+ /// @brief - Creates tensor handlers used by the intermediate tensors. Does not allocate memory.
+ /// @param factory - Factory to be used for handler creation.
void CreateTensorHandles(const IWorkloadFactory& factory);
- /// @brief Get the matching TensorInfo for the output
- /// @return Reference to the output TensorInfo.
+ /// @brief - Gets the matching TensorInfo for the output.
+ /// @return - Reference to the output TensorInfo.
const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
- /// @brief Get the allocated tensor memory.
- /// @return Pointer to the tensor memory
+ /// @brief - Gets the allocated tensor memory.
+ /// @return - Pointer to the tensor memory.
ITensorHandle* GetData() const { return m_TensorHandle.get(); }
- /// Fill the outputs for a given queue descriptor
+ /// Fill the outputs for a given queue descriptor.
void CollectWorkloadOutputs(WorkloadDataCollector& dataCollector) const;
void SetData(std::unique_ptr<ITensorHandle> data) { m_TensorHandle = std::move(data); }
- /// @brief Allocate memory for all the tensors assigned to the handlers
- void AllocateTensors();
-
/// @brief Returns true if SetTensorInfo() has been called at least once on this.
bool IsTensorInfoSet() const { return m_bTensorInfoSet; }
private:
diff --git a/src/armnn/backends/RefLayerSupport.cpp b/src/armnn/backends/RefLayerSupport.cpp
index 0b94656ded..ca4fca6f31 100644
--- a/src/armnn/backends/RefLayerSupport.cpp
+++ b/src/armnn/backends/RefLayerSupport.cpp
@@ -10,7 +10,6 @@
#include <armnn/Tensor.hpp>
#include <boost/core/ignore_unused.hpp>
-
#include "InternalTypes.hpp"
using namespace boost;
@@ -27,15 +26,18 @@ bool IsSupportedForDataTypeRef(std::string* reasonIfUnsupported,
{
return IsSupportedForDataTypeGeneric(reasonIfUnsupported,
dataType,
+ &FalseFunc<Params...>,
floatFuncPtr,
uint8FuncPtr,
std::forward<Params>(params)...);
}
bool IsActivationSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
const ActivationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
+ ignore_unused(output);
ignore_unused(descriptor);
return IsSupportedForDataTypeRef(reasonIfUnsupported,
input.GetDataType(),
@@ -57,6 +59,11 @@ bool IsAdditionSupportedRef(const TensorInfo& input0,
}
bool IsBatchNormalizationSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
const BatchNormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
@@ -94,12 +101,16 @@ bool IsConvolution2dSupportedRef(const TensorInfo& input,
}
bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
const DepthwiseConvolution2dDescriptor& descriptor,
const TensorInfo& weights,
+ const TensorInfo& biases,
std::string* reasonIfUnsupported)
{
+ ignore_unused(output);
ignore_unused(descriptor);
ignore_unused(weights);
+ ignore_unused(biases);
return IsSupportedForDataTypeRef(reasonIfUnsupported,
input.GetDataType(),
&TrueFunc<>,
@@ -107,10 +118,16 @@ bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input,
}
bool IsFullyConnectedSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
const FullyConnectedDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
+ ignore_unused(output);
ignore_unused(descriptor);
+ ignore_unused(weights);
+ ignore_unused(biases);
return IsSupportedForDataTypeRef(reasonIfUnsupported,
input.GetDataType(),
&TrueFunc<>,
@@ -127,8 +144,10 @@ bool IsInputSupportedRef(const TensorInfo& input,
}
bool IsL2NormalizationSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
std::string* reasonIfUnsupported)
{
+ ignore_unused(output);
return IsSupportedForDataTypeRef(reasonIfUnsupported,
input.GetDataType(),
&TrueFunc<>,
@@ -148,9 +167,11 @@ bool IsMergerSupportedRef(const std::vector<const TensorInfo*> inputs,
bool IsMultiplicationSupportedRef(const TensorInfo& input0,
const TensorInfo& input1,
+ const TensorInfo& output,
std::string* reasonIfUnsupported)
{
ignore_unused(input1);
+ ignore_unused(output);
return IsSupportedForDataTypeRef(reasonIfUnsupported,
input0.GetDataType(),
&TrueFunc<>,
@@ -212,9 +233,11 @@ bool IsResizeBilinearSupportedRef(const TensorInfo& input,
}
bool IsSoftmaxSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
const SoftmaxDescriptor& descriptor,
std::string* reasonIfUnsupported)
{
+ ignore_unused(output);
ignore_unused(descriptor);
return IsSupportedForDataTypeRef(reasonIfUnsupported,
input.GetDataType(),
@@ -264,4 +287,78 @@ bool IsFloorSupportedRef(const TensorInfo& input,
&FalseFuncU8<>);
}
+/// LSTM support query for the reference backend.
+/// None of the arguments is inspected: the function always returns false and
+/// leaves reasonIfUnsupported untouched.
+bool IsLstmSupportedRef(const TensorInfo& input, const TensorInfo& outputStateIn,
+                        const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+                        const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+                        const TensorInfo& output, const LstmDescriptor& descriptor,
+                        const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+                        const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+                        const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+                        const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+                        const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+                        const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+                        const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+                        const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+                        const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported)
+{
+    // Input, state and output tensors, plus the layer descriptor.
+    ignore_unused(input, outputStateIn, cellStateIn, scratchBuffer);
+    ignore_unused(outputStateOut, cellStateOut, output, descriptor);
+
+    // Weights and biases that are passed by reference.
+    ignore_unused(inputToForgetWeights, inputToCellWeights, inputToOutputWeights);
+    ignore_unused(recurrentToForgetWeights, recurrentToCellWeights, recurrentToOutputWeights);
+    ignore_unused(forgetGateBias, cellBias, outputGateBias);
+
+    // Weights and biases that are passed by pointer.
+    ignore_unused(inputToInputWeights, recurrentToInputWeights, cellToInputWeights, inputGateBias);
+    ignore_unused(projectionWeights, projectionBias, cellToForgetWeights, cellToOutputWeights);
+
+    return false;
+}
+
+/// Support query for the Fp16-to-Fp32 conversion layer on the reference backend.
+/// The input data type is dispatched first; the output data type is only
+/// dispatched when the input check succeeds.
+bool IsConvertFp16ToFp32SupportedRef(const TensorInfo& input,
+                                     const TensorInfo& output,
+                                     std::string* reasonIfUnsupported)
+{
+    // Input data type check; bail out early so the output check cannot run after a failure.
+    if (!IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+                                       input.GetDataType(),
+                                       &TrueFunc<>,
+                                       &FalseInputFuncF32<>,
+                                       &FalseFuncU8<>))
+    {
+        return false;
+    }
+
+    // Output data type check decides the final answer.
+    return IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+                                         output.GetDataType(),
+                                         &FalseOutputFuncF16<>,
+                                         &TrueFunc<>,
+                                         &FalseFuncU8<>);
+}
+
+/// Support query for the Fp32-to-Fp16 conversion layer on the reference backend.
+/// The input data type is dispatched first; the output data type is only
+/// dispatched when the input check succeeds.
+bool IsConvertFp32ToFp16SupportedRef(const TensorInfo& input,
+                                     const TensorInfo& output,
+                                     std::string* reasonIfUnsupported)
+{
+    // Input data type check; bail out early so the output check cannot run after a failure.
+    if (!IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+                                       input.GetDataType(),
+                                       &FalseInputFuncF16<>,
+                                       &TrueFunc<>,
+                                       &FalseFuncU8<>))
+    {
+        return false;
+    }
+
+    // Output data type check decides the final answer.
+    return IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+                                         output.GetDataType(),
+                                         &TrueFunc<>,
+                                         &FalseOutputFuncF32<>,
+                                         &FalseFuncU8<>);
+}
+
}
diff --git a/src/armnn/backends/RefLayerSupport.hpp b/src/armnn/backends/RefLayerSupport.hpp
index 9db1c14596..5e543ac537 100644
--- a/src/armnn/backends/RefLayerSupport.hpp
+++ b/src/armnn/backends/RefLayerSupport.hpp
@@ -7,11 +7,14 @@
#include <armnn/DescriptorsFwd.hpp>
#include <armnn/Types.hpp>
#include <armnn/Tensor.hpp>
+#include <layers/LstmLayer.hpp>
+#include <boost/optional.hpp>
namespace armnn
{
bool IsActivationSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
const ActivationDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -21,6 +24,11 @@ bool IsAdditionSupportedRef(const TensorInfo& input0,
std::string* reasonIfUnsupported = nullptr);
bool IsBatchNormalizationSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
const BatchNormalizationDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -35,11 +43,16 @@ bool IsConvolution2dSupportedRef(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
const DepthwiseConvolution2dDescriptor& descriptor,
const TensorInfo& weights,
+ const TensorInfo& biases,
std::string* reasonIfUnsupported = nullptr);
bool IsFullyConnectedSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
const FullyConnectedDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -47,14 +60,30 @@ bool IsInputSupportedRef(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsL2NormalizationSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
+/// LSTM support query for the reference backend.
+/// The definition in RefLayerSupport.cpp ignores every tensor and descriptor
+/// argument and returns false.
+/// NOTE(review): the TensorInfo* parameters are presumably optional tensors - confirm against callers.
+bool IsLstmSupportedRef(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr);
+
bool IsMergerSupportedRef(const std::vector<const TensorInfo*> inputs,
const OriginsDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
bool IsMultiplicationSupportedRef(const TensorInfo& input0,
const TensorInfo& input1,
+ const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
bool IsNormalizationSupportedRef(const TensorInfo& input,
@@ -79,6 +108,7 @@ bool IsResizeBilinearSupportedRef(const TensorInfo& input,
std::string* reasonIfUnsupported = nullptr);
bool IsSoftmaxSupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
const SoftmaxDescriptor& descriptor,
std::string* reasonIfUnsupported = nullptr);
@@ -97,4 +127,12 @@ bool IsFloorSupportedRef(const TensorInfo& input,
const TensorInfo& output,
std::string* reasonIfUnsupported = nullptr);
+/// Support query for the Fp16-to-Fp32 conversion layer on the reference backend.
+/// The definition dispatches on the data types of both input and output.
+bool IsConvertFp16ToFp32SupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+/// Support query for the Fp32-to-Fp16 conversion layer on the reference backend.
+/// The definition dispatches on the data types of both input and output.
+bool IsConvertFp32ToFp16SupportedRef(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
}
diff --git a/src/armnn/backends/RefWorkloadFactory.cpp b/src/armnn/backends/RefWorkloadFactory.cpp
index d7d498e89e..9294c5accc 100644
--- a/src/armnn/backends/RefWorkloadFactory.cpp
+++ b/src/armnn/backends/RefWorkloadFactory.cpp
@@ -18,22 +18,15 @@ template <typename F32Workload, typename U8Workload, typename QueueDescriptorTyp
std::unique_ptr<IWorkload> RefWorkloadFactory::MakeWorkload(const QueueDescriptorType& descriptor,
const WorkloadInfo& info) const
{
- if (!IsOperationQueueDescriptor(descriptor) || m_OperationWorkloadsAllowed)
- {
- return armnn::MakeWorkload<F32Workload, U8Workload>(descriptor, info);
- }
- else
- {
- return std::unique_ptr<IWorkload>();
- }
+ return armnn::MakeWorkload<NullWorkload, F32Workload, U8Workload>(descriptor, info);
}
-RefWorkloadFactory::RefWorkloadFactory(bool operationWorkloadsAllowed)
- : m_OperationWorkloadsAllowed(operationWorkloadsAllowed)
+RefWorkloadFactory::RefWorkloadFactory()
{
}
-bool RefWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported)
+bool RefWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
{
return IWorkloadFactory::IsLayerSupported(Compute::CpuRef, layer, dataType, outReasonIfUnsupported);
}
@@ -60,7 +53,7 @@ std::unique_ptr<IWorkload> RefWorkloadFactory::CreateInput(const InputQueueDescr
throw InvalidArgumentException("RefWorkloadFactory::CreateInput: data input and output differ in byte count.");
}
- return MakeWorkload<CopyFromCpuToCpuFloat32Workload, CopyFromCpuToCpuUint8Workload>(descriptor, info);
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> RefWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
@@ -79,7 +72,7 @@ std::unique_ptr<IWorkload> RefWorkloadFactory::CreateOutput(const OutputQueueDes
throw InvalidArgumentException("RefWorkloadFactory::CreateOutput: data input and output differ in byte count.");
}
- return MakeWorkload<CopyFromCpuToCpuFloat32Workload, CopyFromCpuToCpuUint8Workload>(descriptor, info);
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> RefWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
@@ -168,25 +161,7 @@ std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreateMemCopy(const MemCop
{
throw InvalidArgumentException("RefWorkloadFactory: CreateMemCopy() expected an input tensor.");
}
- // Create a workload that will copy tensor data from the inputs, which can have a number of different formats,
- // to CPU tensors.
- switch (descriptor.m_Inputs[0]->GetType())
- {
-#if ARMCOMPUTECL_ENABLED
- case ITensorHandle::CL:
- {
- return MakeWorkload<CopyFromClToCpuFloat32Workload, CopyFromClToCpuUint8Workload>(descriptor, info);
- }
-#endif
-#if ARMCOMPUTENEON_ENABLED
- case ITensorHandle::Neon:
- {
- return MakeWorkload<CopyFromNeonToCpuFloat32Workload, CopyFromNeonToCpuUint8Workload>(descriptor, info);
- }
-#endif
- default:
- throw InvalidArgumentException("RefWorkloadFactory: Destination type not supported for MemCopy Workload.");
- }
+ return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
}
std::unique_ptr<IWorkload> RefWorkloadFactory::CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
@@ -221,9 +196,29 @@ std::unique_ptr<IWorkload> RefWorkloadFactory::CreateReshape(const ReshapeQueueD
}
std::unique_ptr<IWorkload> RefWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
- const WorkloadInfo& info) const
+ const WorkloadInfo& info) const
{
return MakeWorkload<RefFloorFloat32Workload, NullWorkload>(descriptor, info);
}
+/// Creates the reference-backend workload for an LSTM queue descriptor.
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+                                                          const WorkloadInfo& info) const
+{
+    // RefLstmFloat32Workload fills the float slot of MakeWorkload; NullWorkload fills the uint8 slot.
+    auto lstmWorkload = MakeWorkload<RefLstmFloat32Workload, NullWorkload>(descriptor, info);
+    return lstmWorkload;
+}
+
+/// Creates the reference workload for a ConvertFp16ToFp32 queue descriptor.
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateConvertFp16ToFp32(
+    const ConvertFp16ToFp32QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
+{
+    // Constructed directly rather than through the MakeWorkload helper.
+    std::unique_ptr<IWorkload> workload = std::make_unique<RefConvertFp16ToFp32Workload>(descriptor, info);
+    return workload;
+}
+
+/// Creates the reference workload for a ConvertFp32ToFp16 queue descriptor.
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateConvertFp32ToFp16(
+    const ConvertFp32ToFp16QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
+{
+    // Constructed directly rather than through the MakeWorkload helper.
+    std::unique_ptr<IWorkload> workload = std::make_unique<RefConvertFp32ToFp16Workload>(descriptor, info);
+    return workload;
+}
+
} // namespace armnn
diff --git a/src/armnn/backends/RefWorkloadFactory.hpp b/src/armnn/backends/RefWorkloadFactory.hpp
index 3fab490ad8..ee8639f8ed 100644
--- a/src/armnn/backends/RefWorkloadFactory.hpp
+++ b/src/armnn/backends/RefWorkloadFactory.hpp
@@ -8,6 +8,7 @@
#include "OutputHandler.hpp"
#include <boost/core/ignore_unused.hpp>
+#include <boost/optional.hpp>
namespace armnn
{
@@ -24,16 +25,17 @@ constexpr bool IsOperationQueueDescriptor(const ConstantQueueDescriptor&) { retu
template <>
constexpr bool IsOperationQueueDescriptor(const PermuteQueueDescriptor&) { return false; }
-// Reference workload factory
+// Reference workload factory.
class RefWorkloadFactory : public IWorkloadFactory
{
public:
- explicit RefWorkloadFactory(bool operationWorkloadsAllowed = true);
- virtual ~RefWorkloadFactory() { };
+ explicit RefWorkloadFactory();
+ virtual ~RefWorkloadFactory() {}
virtual Compute GetCompute() const override { return Compute::CpuRef; }
- static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported);
+ static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
virtual bool SupportsSubTensors() const override { return false; }
@@ -43,7 +45,7 @@ public:
{
boost::ignore_unused(parent, subTensorShape, subTensorOrigin);
return nullptr;
- };
+ }
virtual std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo) const override;
@@ -113,12 +115,20 @@ public:
virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
const WorkloadInfo& info) const override;
+ virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
private:
template <typename F32Workload, typename U8Workload, typename QueueDescriptorType>
std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const;
- const bool m_OperationWorkloadsAllowed;
};
} // namespace armnn
diff --git a/src/armnn/backends/RefWorkloads.hpp b/src/armnn/backends/RefWorkloads.hpp
index ed4fa840da..1defdbbe82 100644
--- a/src/armnn/backends/RefWorkloads.hpp
+++ b/src/armnn/backends/RefWorkloads.hpp
@@ -52,3 +52,6 @@
#include "backends/RefWorkloads/Pooling2d.hpp"
#include "backends/RefWorkloads/RefFakeQuantizationFloat32Workload.hpp"
#include "backends/RefWorkloads/RefPermuteWorkload.hpp"
+#include "backends/RefWorkloads/RefLstmFloat32Workload.hpp"
+#include "backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp"
+#include "backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp"
diff --git a/src/armnn/backends/RefWorkloads/Activation.cpp b/src/armnn/backends/RefWorkloads/Activation.cpp
index ede283cbf9..fdb6091ad7 100644
--- a/src/armnn/backends/RefWorkloads/Activation.cpp
+++ b/src/armnn/backends/RefWorkloads/Activation.cpp
@@ -24,7 +24,7 @@ void Activation(const float* in,
float input = in[i];
float output;
- // compute the result of the activation function
+ // Compute the result of the activation function.
switch (function)
{
case ActivationFunction::Linear:
diff --git a/src/armnn/backends/RefWorkloads/Activation.hpp b/src/armnn/backends/RefWorkloads/Activation.hpp
index 874441c862..4ee604b462 100644
--- a/src/armnn/backends/RefWorkloads/Activation.hpp
+++ b/src/armnn/backends/RefWorkloads/Activation.hpp
@@ -9,7 +9,7 @@
namespace armnn
{
-/// Performs the ActivationFunction elementwise on the inputs to give the outputs
+/// Performs the ActivationFunction elementwise on the inputs to give the outputs.
void Activation(const float* in,
float* out,
const TensorInfo& tensorInfo,
diff --git a/src/armnn/backends/RefWorkloads/Broadcast.hpp b/src/armnn/backends/RefWorkloads/Broadcast.hpp
index b65b57f7a1..bdf03f2a16 100644
--- a/src/armnn/backends/RefWorkloads/Broadcast.hpp
+++ b/src/armnn/backends/RefWorkloads/Broadcast.hpp
@@ -43,7 +43,7 @@ struct BroadcastLoop
}
private:
- // Struct to hold the dimension data
+ // Struct to hold the dimension data.
struct BroadcastDimensionData
{
unsigned int m_DimSize;
diff --git a/src/armnn/backends/RefWorkloads/ConvImpl.cpp b/src/armnn/backends/RefWorkloads/ConvImpl.cpp
index 9ebadacddb..3dcd344101 100644
--- a/src/armnn/backends/RefWorkloads/ConvImpl.cpp
+++ b/src/armnn/backends/RefWorkloads/ConvImpl.cpp
@@ -46,7 +46,7 @@ int32_t QuantizedMultiplierSmallerThanOne::operator*(int32_t rhs) const
int32_t QuantizedMultiplierSmallerThanOne::SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
{
- // Check for overflow
+ // Check for overflow.
if (a == b && a == std::numeric_limits<int32_t>::min())
{
return std::numeric_limits<int32_t>::max();
diff --git a/src/armnn/backends/RefWorkloads/ConvImpl.hpp b/src/armnn/backends/RefWorkloads/ConvImpl.hpp
index 8b66b0b7d2..b7d5d17a8d 100644
--- a/src/armnn/backends/RefWorkloads/ConvImpl.hpp
+++ b/src/armnn/backends/RefWorkloads/ConvImpl.hpp
@@ -18,7 +18,7 @@
namespace armnn
{
-/// Performs multiplication of a integer with a multiplier which is less than one,
+/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
@@ -28,21 +28,21 @@ public:
/// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
QuantizedMultiplierSmallerThanOne(float multiplier);
- /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne()
+ /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
int32_t operator*(int32_t rhs) const;
private:
- /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul()
+ /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);
- /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT()
+ /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
static int32_t RoundingDivideByPOT(int32_t x, int exponent);
int32_t m_Multiplier;
int32_t m_RightShift;
};
-/// an implementation shared by normal and depthwise convolution
+/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
const InputType* inputData,
@@ -55,6 +55,7 @@ static void ConvImpl(ConvData data,
InputType* outputData,
float outputScale,
int32_t outputOffset,
+ const TensorInfo& filterInfo,
bool depthwise = false)
{
if (data.m_Parameters.m_BiasEnabled && !biasData)
@@ -64,7 +65,6 @@ static void ConvImpl(ConvData data,
const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]);
const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
- const TensorInfo& filterInfo = data.m_Weight->GetTensorInfo();
unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1;
unsigned int channelsInput = filterInfo.GetShape()[1];
@@ -84,7 +84,7 @@ static void ConvImpl(ConvData data,
unsigned int hStride = data.m_Parameters.m_StrideY;
unsigned int xStride = data.m_Parameters.m_StrideX;
- // the world's least efficient convolution
+ // The world's least efficient convolution.
for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
{
for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
@@ -93,11 +93,11 @@ static void ConvImpl(ConvData data,
{
for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
{
- // this loop goes over each output element
+ // This loop goes over each output element.
AccumulatorType sum = AccumulatorType();
- // for depthwise, each output channel corresponds to exactly one input channel
- // for normal, must loop over each input channel
+ // For depthwise, each output channel corresponds to exactly one input channel.
+ // For normal, must loop over each input channel.
for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
{
unsigned int depthwiseMultiplierIdx = 0;
@@ -111,11 +111,11 @@ static void ConvImpl(ConvData data,
{
for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
{
- // this loop goes over each input element for each output element
+ // This loop goes over each input element for each output element.
unsigned int filterIndex;
- // since dimensionality of kernel depends on depthwiseness, so does index
+ // Since dimensionality of kernel depends on depthwiseness, so does index.
if (depthwise)
{
filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
@@ -138,7 +138,7 @@ static void ConvImpl(ConvData data,
AccumulatorType inputValue;
- // check if we're in the padding
+ // Check if we're in the padding.
if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
xInput < paddingLeft || xInput >= widthInput + paddingLeft )
{
diff --git a/src/armnn/backends/RefWorkloads/FullyConnected.cpp b/src/armnn/backends/RefWorkloads/FullyConnected.cpp
index 8ba11d19c6..1a8263b9a1 100644
--- a/src/armnn/backends/RefWorkloads/FullyConnected.cpp
+++ b/src/armnn/backends/RefWorkloads/FullyConnected.cpp
@@ -18,11 +18,11 @@ void FullyConnected(const float* inputData,
const float* biasData,
bool transposeWeights)
{
- unsigned int N = outputTensorInfo.GetShape()[1]; // Output Vector Size
+ unsigned int N = outputTensorInfo.GetShape()[1]; // Output Vector Size.
- BOOST_ASSERT(inputTensorInfo.GetNumDimensions() > 1); // Need some data
+ BOOST_ASSERT(inputTensorInfo.GetNumDimensions() > 1); // Needs some data.
- unsigned int K = 1; // Total number of activations in the input
+ unsigned int K = 1; // Total number of activations in the input.
for (unsigned int i = 1; i < inputTensorInfo.GetNumDimensions(); i++)
{
K *= inputTensorInfo.GetShape()[i];
diff --git a/src/armnn/backends/RefWorkloads/FullyConnected.hpp b/src/armnn/backends/RefWorkloads/FullyConnected.hpp
index 9fa2456110..fa6f54a3ec 100644
--- a/src/armnn/backends/RefWorkloads/FullyConnected.hpp
+++ b/src/armnn/backends/RefWorkloads/FullyConnected.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-/// Performs a matrix multiplication and optionally adds a bias
+/// Performs a matrix multiplication and optionally adds a bias.
void FullyConnected(const float* inputData,
float* outputData,
const TensorInfo& inputTensorInfo,
diff --git a/src/armnn/backends/RefWorkloads/Merger.hpp b/src/armnn/backends/RefWorkloads/Merger.hpp
index 7d1bfab557..1294d05e08 100644
--- a/src/armnn/backends/RefWorkloads/Merger.hpp
+++ b/src/armnn/backends/RefWorkloads/Merger.hpp
@@ -29,7 +29,7 @@ void Merger(const MergerQueueDescriptor& data)
for (unsigned int i=0; i<outputInfo0.GetNumDimensions(); i++)
{
dimensionStride /= outputInfo0.GetShape()[i];
- indices[i] = indexRemainder / dimensionStride; // use integer division to round down
+ indices[i] = indexRemainder / dimensionStride; // Use integer division to round down.
indexRemainder -= indices[i] * dimensionStride;
}
@@ -37,11 +37,11 @@ void Merger(const MergerQueueDescriptor& data)
{
MergerQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx];
- //split view extents are defined by the size of (the corresponding) input tensor
+ //Split view extents are defined by the size of (the corresponding) input tensor.
const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[viewIdx]);
BOOST_ASSERT(inputInfo.GetNumDimensions() == outputInfo0.GetNumDimensions());
- // check all dimensions to see if this element is inside the given input view
+ // Check all dimensions to see if this element is inside the given input view.
bool insideView = true;
for (unsigned int i=0; i<inputInfo.GetNumDimensions(); i++)
{
@@ -66,13 +66,13 @@ void Merger(const MergerQueueDescriptor& data)
dimensionStride *= inputInfo.GetShape()[i];
}
- //we are within the view, copy input data to the output corresponding to this view
+ //We are within the view, copy input data to the output corresponding to this view.
(GetOutputTensorData<DataType>(0, data))[index] =
(GetInputTensorData<DataType>(viewIdx, data))[inIndex];
- //what should we do if input views overlap on the output tensor?
- //we could error, take the average, or shm else...
- //for now just stop after finding first view (input) that matches.
+ //What should we do if input views overlap on the output tensor?
+ //We could error, take the average, or something else...
+ //For now just stop after finding first view (input) that matches.
break;
}
}
diff --git a/src/armnn/backends/RefWorkloads/Pooling2d.cpp b/src/armnn/backends/RefWorkloads/Pooling2d.cpp
index a643e67690..4047f061b3 100644
--- a/src/armnn/backends/RefWorkloads/Pooling2d.cpp
+++ b/src/armnn/backends/RefWorkloads/Pooling2d.cpp
@@ -164,7 +164,7 @@ void Pooling2d(const float* in,
Executor execute = GetExecutor(params.m_PoolType);
// Check supported padding methods outside the loop to simplify
- // the inner loop
+ // the inner loop.
if (params.m_PaddingMethod != PaddingMethod::Exclude &&
params.m_PaddingMethod != PaddingMethod::IgnoreValue)
{
@@ -192,7 +192,7 @@ void Pooling2d(const float* in,
float result = defaultInitializer;
float poolAreaSize = boost::numeric_cast<float>((hend - hstart) * (wend - wstart));
- // special case: when the pooling kernel is over a padding region and the padding
+ // Special case: when the pooling kernel is over a padding region and the padding
// size is larger or equal to the kernel and the kernel only covers
// padding and no real values, then we initialize the result as zero
// by convention. This is because we need to choose a value here and
@@ -208,8 +208,8 @@ void Pooling2d(const float* in,
if (clamped && params.m_PaddingMethod == PaddingMethod::Exclude)
{
- // when we exclude the padding, it means we calculate with a smaller
- // kernel size, so I change the divisor here
+ // When we exclude the padding, it means we calculate with a smaller
+ // kernel size, so the divisor is adjusted here.
poolAreaSize = boost::numeric_cast<float>((hend - hstart) * (wend - wstart));
}
diff --git a/src/armnn/backends/RefWorkloads/Pooling2d.hpp b/src/armnn/backends/RefWorkloads/Pooling2d.hpp
index f88b1a0a4e..cefd022fb3 100644
--- a/src/armnn/backends/RefWorkloads/Pooling2d.hpp
+++ b/src/armnn/backends/RefWorkloads/Pooling2d.hpp
@@ -11,7 +11,7 @@
namespace armnn
{
-/// Computes the Pooling2d operation
+/// Computes the Pooling2d operation.
void Pooling2d(const float* in,
float* out,
const TensorInfo& inputInfo,
diff --git a/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp b/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp
index 0ede46d9fb..9044fca1c2 100644
--- a/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp
@@ -13,7 +13,7 @@
namespace armnn
{
-// Base class template providing an implementation of the Constant layer common to all data types
+// Base class template providing an implementation of the Constant layer common to all data types.
template <armnn::DataType DataType>
class RefBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataType>
{
diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp
index c421b0f212..fbc1f07111 100644
--- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp
@@ -12,15 +12,22 @@
namespace armnn
{
+RefBatchNormalizationFloat32Workload::RefBatchNormalizationFloat32Workload(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info),
+ m_Mean(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Mean))),
+ m_Variance(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Variance))),
+ m_Beta(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Beta))),
+ m_Gamma(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Gamma))) {}
void RefBatchNormalizationFloat32Workload::Execute() const
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationFloat32Workload_Execute");
- const float* var = m_Data.m_Variance->GetConstTensor<float>();
- const float* mean = m_Data.m_Mean->GetConstTensor<float>();
- const float* gamma = m_Data.m_Gamma->GetConstTensor<float>();
- const float* beta = m_Data.m_Beta->GetConstTensor<float>();
+ const float* var = m_Variance->GetConstTensor<float>();
+ const float* mean = m_Mean->GetConstTensor<float>();
+ const float* gamma = m_Gamma->GetConstTensor<float>();
+ const float* beta = m_Beta->GetConstTensor<float>();
auto inputData = GetInputTensorDataFloat(0, m_Data);
auto outputData = GetOutputTensorDataFloat(0, m_Data);
diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp
index cbcdadd749..780c329cc6 100644
--- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp
@@ -14,8 +14,15 @@ namespace armnn
class RefBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor>
{
public:
- using Float32Workload<BatchNormalizationQueueDescriptor>::Float32Workload;
+ explicit RefBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Mean;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Variance;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Beta;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Gamma;
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp
index 8a48523765..4a8e296619 100644
--- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp
@@ -14,23 +14,30 @@
namespace armnn
{
+RefBatchNormalizationUint8Workload::RefBatchNormalizationUint8Workload(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Uint8Workload<BatchNormalizationQueueDescriptor>(descriptor, info),
+ m_Mean(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Mean))),
+ m_Variance(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Variance))),
+ m_Beta(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Beta))),
+ m_Gamma(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Gamma))) {}
void RefBatchNormalizationUint8Workload::Execute() const
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationUint8Workload_Execute");
const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
- const TensorInfo& varInfo = GetTensorInfo(m_Data.m_Variance);
- const TensorInfo& meanInfo = GetTensorInfo(m_Data.m_Mean);
- const TensorInfo& gammaInfo = GetTensorInfo(m_Data.m_Gamma);
- const TensorInfo& betaInfo = GetTensorInfo(m_Data.m_Beta);
+ const TensorInfo& varInfo = GetTensorInfo(m_Variance.get());
+ const TensorInfo& meanInfo = GetTensorInfo(m_Mean.get());
+ const TensorInfo& gammaInfo = GetTensorInfo(m_Gamma.get());
+ const TensorInfo& betaInfo = GetTensorInfo(m_Beta.get());
const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
auto input = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo0);
- auto var = Dequantize(m_Data.m_Variance->GetConstTensor<uint8_t>(), varInfo);
- auto mean = Dequantize(m_Data.m_Mean->GetConstTensor<uint8_t>(), meanInfo);
- auto gamma = Dequantize(m_Data.m_Gamma->GetConstTensor<uint8_t>(), gammaInfo);
- auto beta = Dequantize(m_Data.m_Beta->GetConstTensor<uint8_t>(), betaInfo);
+ auto var = Dequantize(m_Variance->GetConstTensor<uint8_t>(), varInfo);
+ auto mean = Dequantize(m_Mean->GetConstTensor<uint8_t>(), meanInfo);
+ auto gamma = Dequantize(m_Gamma->GetConstTensor<uint8_t>(), gammaInfo);
+ auto beta = Dequantize(m_Beta->GetConstTensor<uint8_t>(), betaInfo);
std::vector<float> results(outputInfo.GetNumElements());
BatchNormImpl(m_Data, var.data(), mean.data(), gamma.data(), beta.data(), results.data(), input.data());
diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp
index 57fe995ba5..2c12d28c3f 100644
--- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp
@@ -14,8 +14,15 @@ namespace armnn
class RefBatchNormalizationUint8Workload : public Uint8Workload<BatchNormalizationQueueDescriptor>
{
public:
- using Uint8Workload<BatchNormalizationQueueDescriptor>::Uint8Workload;
+ explicit RefBatchNormalizationUint8Workload(const BatchNormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Mean;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Variance;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Beta;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Gamma;
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp
new file mode 100644
index 0000000000..c4b78014b2
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "RefConvertFp16ToFp32Workload.hpp"
+#include "Half.hpp"
+#include "RefWorkloadUtils.hpp"
+#include "FloatingPointConverter.hpp"
+
+namespace armnn
+{
+
+void RefConvertFp16ToFp32Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp16ToFp32Workload_Execute");
+
+ const Half* const input = GetInputTensorDataHalf(0, m_Data);
+ float* const output = GetOutputTensorDataFloat(0, m_Data);
+
+ unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+ armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output);
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp
new file mode 100644
index 0000000000..34ae35545b
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+
+namespace armnn
+{
+
+class RefConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
+{
+public:
+ using Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>::Float16ToFloat32Workload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp
new file mode 100644
index 0000000000..3c93297302
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "RefConvertFp32ToFp16Workload.hpp"
+
+#include "Half.hpp"
+#include "FloatingPointConverter.hpp"
+#include "RefWorkloadUtils.hpp"
+
+#include "Profiling.hpp"
+
+namespace armnn
+{
+
+void RefConvertFp32ToFp16Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp32ToFp16Workload_Execute");
+
+ const float* const input = GetInputTensorDataFloat(0, m_Data);
+ Half* const output = GetOutputTensorDataHalf(0, m_Data);
+
+ // Convert Fp32 input to Fp16 output.
+ unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+ armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output);
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp
new file mode 100644
index 0000000000..903a50449f
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+
+namespace armnn
+{
+
+class RefConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
+{
+public:
+ using Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>::Float32ToFloat16Workload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp
index 6e4cc69063..4fe823a288 100644
--- a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp
@@ -12,6 +12,12 @@
namespace armnn
{
+RefConvolution2dFloat32Workload::RefConvolution2dFloat32Workload(
+ const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Float32Workload<Convolution2dQueueDescriptor>(descriptor, info),
+ m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))),
+ m_Bias(descriptor.m_Parameters.m_BiasEnabled
+ ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {}
void RefConvolution2dFloat32Workload::Execute() const
{
@@ -19,12 +25,13 @@ void RefConvolution2dFloat32Workload::Execute() const
float* outputData = GetOutputTensorDataFloat(0, m_Data);
const float* inputData = GetInputTensorDataFloat(0, m_Data);
- const float* weightData = m_Data.m_Weight->template GetConstTensor<float>();
+ const float* weightData = m_Weight->template GetConstTensor<float>();
const float* biasData = m_Data.m_Parameters.m_BiasEnabled ?
- m_Data.m_Bias->template GetConstTensor<float>() : nullptr;
+ m_Bias->template GetConstTensor<float>() : nullptr;
+ const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
ConvImpl<armnn::Convolution2dQueueDescriptor, float, float, float>(
- m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0);
+ m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo);
}
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp
index 514369c262..ecf0082f33 100644
--- a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp
@@ -14,8 +14,14 @@ namespace armnn
class RefConvolution2dFloat32Workload : public Float32Workload<Convolution2dQueueDescriptor>
{
public:
- using Float32Workload<Convolution2dQueueDescriptor>::Float32Workload;
+ explicit RefConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
+
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp
index f390baa387..19e9c2ed0a 100644
--- a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp
@@ -12,6 +12,12 @@
namespace armnn
{
+RefConvolution2dUint8Workload::RefConvolution2dUint8Workload(
+ const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Uint8Workload<Convolution2dQueueDescriptor>(descriptor, info),
+ m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))),
+ m_Bias(descriptor.m_Parameters.m_BiasEnabled
+ ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {}
void RefConvolution2dUint8Workload::Execute() const
{
@@ -19,20 +25,21 @@ void RefConvolution2dUint8Workload::Execute() const
const uint8_t* inputData = GetInputTensorDataU8(0, m_Data);
const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
- const uint8_t* weightsData = m_Data.m_Weight->template GetConstTensor<uint8_t>();
- const TensorInfo& weightsInfo = GetTensorInfo(m_Data.m_Weight);
+ const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
+ const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
- m_Data.m_Bias->template GetConstTensor<int32_t>() :
+ m_Bias->template GetConstTensor<int32_t>() :
nullptr;
uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+ const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
ConvImpl<armnn::Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
m_Data,
inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(),
weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
biasData,
- outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset());
+ outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
}
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp
index 954a206463..733d2052b2 100644
--- a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp
@@ -14,8 +14,15 @@ namespace armnn
class RefConvolution2dUint8Workload : public Uint8Workload<Convolution2dQueueDescriptor>
{
public:
- using Uint8Workload<Convolution2dQueueDescriptor>::Uint8Workload;
+ explicit RefConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
+
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp
index c631fecb66..f3167e299a 100644
--- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp
@@ -12,6 +12,12 @@
namespace armnn
{
+RefDepthwiseConvolution2dFloat32Workload::RefDepthwiseConvolution2dFloat32Workload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info),
+ m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))),
+ m_Bias(descriptor.m_Parameters.m_BiasEnabled
+ ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {}
void RefDepthwiseConvolution2dFloat32Workload::Execute() const
{
@@ -19,12 +25,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const
float* outputData = GetOutputTensorDataFloat(0, m_Data);
const float* inputData = GetInputTensorDataFloat(0, m_Data);
- const float* weightData = m_Data.m_Weight->template GetConstTensor<float>();
+ const float* weightData = m_Weight->template GetConstTensor<float>();
const float* biasData = m_Data.m_Parameters.m_BiasEnabled ?
- m_Data.m_Bias->template GetConstTensor<float>() : nullptr;
+ m_Bias->template GetConstTensor<float>() : nullptr;
+ const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
ConvImpl<armnn::DepthwiseConvolution2dQueueDescriptor, float, float, float>
- (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, true);
+ (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true);
}
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp
index 34e6524684..042e7b3c0a 100644
--- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp
@@ -14,8 +14,14 @@ namespace armnn
class RefDepthwiseConvolution2dFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor>
{
public:
- using Float32Workload<DepthwiseConvolution2dQueueDescriptor>::Float32Workload;
+ explicit RefDepthwiseConvolution2dFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp
index 5a8fb13112..fd5ade5559 100644
--- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp
@@ -13,26 +13,34 @@
namespace armnn
{
+RefDepthwiseConvolution2dUint8Workload::RefDepthwiseConvolution2dUint8Workload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Uint8Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info),
+ m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))),
+ m_Bias(descriptor.m_Parameters.m_BiasEnabled
+ ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {}
+
void RefDepthwiseConvolution2dUint8Workload::Execute() const
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dUint8Workload_Execute");
const uint8_t* inputData = GetInputTensorDataU8(0, m_Data);
const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
- const uint8_t* weightsData = m_Data.m_Weight->template GetConstTensor<uint8_t>();
- const TensorInfo& weightsInfo = GetTensorInfo(m_Data.m_Weight);
+ const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
+ const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
- m_Data.m_Bias->template GetConstTensor<int32_t>() :
+ m_Bias->template GetConstTensor<int32_t>() :
nullptr;
uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+ const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
ConvImpl<armnn::DepthwiseConvolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
m_Data,
inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(),
weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
biasData,
- outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), true);
+ outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
}
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp
index bd9945f529..2c8ed2d084 100644
--- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp
@@ -14,8 +14,13 @@ namespace armnn
class RefDepthwiseConvolution2dUint8Workload : public Uint8Workload<DepthwiseConvolution2dQueueDescriptor>
{
public:
- using Uint8Workload<DepthwiseConvolution2dQueueDescriptor>::Uint8Workload;
+ explicit RefDepthwiseConvolution2dUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp
index 6fe203e5f0..818455e0e9 100644
--- a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp
@@ -12,6 +12,12 @@
namespace armnn
{
+RefFullyConnectedFloat32Workload::RefFullyConnectedFloat32Workload(
+ const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info),
+ m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))),
+ m_Bias(descriptor.m_Parameters.m_BiasEnabled
+ ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {}
void RefFullyConnectedFloat32Workload::Execute() const
{
@@ -22,8 +28,8 @@ void RefFullyConnectedFloat32Workload::Execute() const
float* outputData = GetOutputTensorDataFloat(0, m_Data);
const float* inputData = GetInputTensorDataFloat(0, m_Data);
- const float* weightData = m_Data.m_Weight->GetConstTensor<float>();
- const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Data.m_Bias->GetConstTensor<float>() : nullptr;
+ const float* weightData = m_Weight->GetConstTensor<float>();
+ const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->GetConstTensor<float>() : nullptr;
FullyConnected(inputData,
outputData,
diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp
index cb835bd2ce..639d935a16 100644
--- a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp
@@ -14,8 +14,13 @@ namespace armnn
class RefFullyConnectedFloat32Workload : public Float32Workload<FullyConnectedQueueDescriptor>
{
public:
- using Float32Workload<FullyConnectedQueueDescriptor>::Float32Workload;
+ explicit RefFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp
index 0186d3f5e5..cd653657e1 100644
--- a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp
@@ -14,6 +14,12 @@
namespace armnn
{
+RefFullyConnectedUint8Workload::RefFullyConnectedUint8Workload(
+ const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Uint8Workload<FullyConnectedQueueDescriptor>(descriptor, info),
+ m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))),
+ m_Bias(descriptor.m_Parameters.m_BiasEnabled
+ ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {}
void RefFullyConnectedUint8Workload::Execute() const
{
@@ -22,18 +28,18 @@ void RefFullyConnectedUint8Workload::Execute() const
const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
- const uint8_t* weightData = m_Data.m_Weight->GetConstTensor<uint8_t>();
+ const uint8_t* weightData = m_Weight->GetConstTensor<uint8_t>();
auto dequant = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo);
- auto weight = Dequantize(weightData, m_Data.m_Weight->GetTensorInfo());
+ auto weight = Dequantize(weightData, m_Weight->GetTensorInfo());
- std::vector<float> results(inputInfo.GetNumElements());
+ std::vector<float> results(outputInfo.GetNumElements());
if (m_Data.m_Parameters.m_BiasEnabled)
{
- const int32_t* biasData = m_Data.m_Bias->GetConstTensor<int32_t>();
- auto bias = Dequantize(biasData, m_Data.m_Bias->GetTensorInfo());
+ const int32_t* biasData = m_Bias->GetConstTensor<int32_t>();
+ auto bias = Dequantize(biasData, m_Bias->GetTensorInfo());
FullyConnected(dequant.data(),
results.data(),
diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp
index cd14ea85e0..36e5f631ad 100644
--- a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp
+++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp
@@ -14,8 +14,13 @@ namespace armnn
class RefFullyConnectedUint8Workload : public Uint8Workload<FullyConnectedQueueDescriptor>
{
public:
- using Uint8Workload<FullyConnectedQueueDescriptor>::Uint8Workload;
+ explicit RefFullyConnectedUint8Workload(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
virtual void Execute() const override;
+
+private:
+ std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
+ std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
};
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp
new file mode 100644
index 0000000000..bc33638310
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp
@@ -0,0 +1,16 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "RefLstmFloat32Workload.hpp"
+
+namespace armnn
+{
+
+void RefLstmFloat32Workload::Execute() const
+{
+ throw armnn::Exception("No implementation of Lstm in the Ref backend!");
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp
new file mode 100644
index 0000000000..0acce4d309
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+
+namespace armnn
+{
+
+class RefLstmFloat32Workload : public Float32Workload<LstmQueueDescriptor>
+{
+public:
+ using Float32Workload<LstmQueueDescriptor>::Float32Workload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp
index c743207423..f4dff60ae4 100644
--- a/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp
@@ -17,7 +17,7 @@
namespace armnn
{
-// Helper function to compute "Within" normalization using Krichevsky 2012: Local Brightness Normalization
+// Helper function to compute "Within" normalization using Krizhevsky 2012: Local Brightness Normalization.
static void NormalizeWithinUingLbr(const float* inputData,
float* outputData,
const TensorShape& tensorShape,
@@ -80,7 +80,7 @@ static void NormalizeWithinUingLbr(const float* inputData,
}
}
-// Helper function to compute "Across" normalization using Krichevsky 2012: Local Brightness Normalization
+// Helper function to compute "Across" normalization using Krizhevsky 2012: Local Brightness Normalization.
void NormalizeAcrossUingLbr(const float* inputData,
float* outputData,
const TensorShape& tensorShape,
diff --git a/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp b/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp
index b2bb8fbf3d..93c883d826 100644
--- a/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp
+++ b/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp
@@ -7,6 +7,7 @@
#include "RefWorkloadUtils.hpp"
#include <Permute.hpp>
+#include "TypeUtils.hpp"
namespace armnn
{
diff --git a/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp b/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp
index 088fe819e5..1df735ea55 100644
--- a/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp
+++ b/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp
@@ -9,6 +9,7 @@
#include <armnn/Tensor.hpp>
#include <armnn/Types.hpp>
+#include <Half.hpp>
#include <boost/polymorphic_cast.hpp>
@@ -70,6 +71,18 @@ float* GetOutputTensorDataFloat(unsigned int idx, const PayloadType& data)
return GetOutputTensorData<float>(idx, data);
}
+template <typename PayloadType>
+const Half* GetInputTensorDataHalf(unsigned int idx, const PayloadType& data)
+{
+ return GetInputTensorData<Half>(idx, data);
+}
+
+template <typename PayloadType>
+Half* GetOutputTensorDataHalf(unsigned int idx, const PayloadType& data)
+{
+ return GetOutputTensorData<Half>(idx, data);
+}
+
////////////////////////////////////////////
/// u8 helpers
////////////////////////////////////////////
diff --git a/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp b/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp
index 7b386ed467..d8bca4be44 100644
--- a/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp
+++ b/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp
@@ -27,7 +27,7 @@ inline float Lerp(float a, float b, float w)
void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, const TensorInfo& outputInfo)
{
- // We follow the definition of TensorFlow and AndroidNN: The top-left corner of a texel in the output
+ // We follow the definition of TensorFlow and AndroidNN: the top-left corner of a texel in the output
// image is projected into the input image to figure out the interpolants and weights. Note that this
// will yield different results than if projecting the centre of output texels.
@@ -39,8 +39,8 @@ void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, co
const unsigned int outputHeight = outputInfo.GetShape()[2];
const unsigned int outputWidth = outputInfo.GetShape()[3];
- // How much to scale pixel coordinates in the output image to get the corresponding pixel coordinates
- // in the input image
+ // How much to scale pixel coordinates in the output image, to get the corresponding pixel coordinates
+ // in the input image.
const float scaleY = boost::numeric_cast<float>(inputHeight) / boost::numeric_cast<float>(outputHeight);
const float scaleX = boost::numeric_cast<float>(inputWidth) / boost::numeric_cast<float>(outputWidth);
@@ -53,33 +53,33 @@ void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, co
{
for (unsigned int y = 0; y < outputHeight; ++y)
{
- // Corresponding real-valued height coordinate in input image
+ // Corresponding real-valued height coordinate in input image.
const float iy = boost::numeric_cast<float>(y) * scaleY;
- // Discrete height coordinate of top-left texel (in the 2x2 texel area used for interpolation)
+ // Discrete height coordinate of top-left texel (in the 2x2 texel area used for interpolation).
const float fiy = floorf(iy);
const unsigned int y0 = boost::numeric_cast<unsigned int>(fiy);
- // Interpolation weight (range [0,1])
+ // Interpolation weight (range [0,1]).
const float yw = iy - fiy;
for (unsigned int x = 0; x < outputWidth; ++x)
{
- // Real-valued and discrete width coordinates in input image
+ // Real-valued and discrete width coordinates in input image.
const float ix = boost::numeric_cast<float>(x) * scaleX;
const float fix = floorf(ix);
const unsigned int x0 = boost::numeric_cast<unsigned int>(fix);
- // Interpolation weight (range [0,1])
+ // Interpolation weight (range [0,1]).
const float xw = ix - fix;
- // Discrete width/height coordinates of texels below and to the right of (x0, y0)
+ // Discrete width/height coordinates of texels below and to the right of (x0, y0).
const unsigned int x1 = std::min(x0 + 1, inputWidth - 1u);
const unsigned int y1 = std::min(y0 + 1, inputHeight - 1u);
// Interpolation
- const float ly0 = Lerp(input.Get(n, c, y0, x0), input.Get(n, c, y0, x1), xw); // lerp along row y0
- const float ly1 = Lerp(input.Get(n, c, y1, x0), input.Get(n, c, y1, x1), xw); // lerp along row y1
+ const float ly0 = Lerp(input.Get(n, c, y0, x0), input.Get(n, c, y0, x1), xw); // lerp along row y0.
+ const float ly1 = Lerp(input.Get(n, c, y1, x0), input.Get(n, c, y1, x1), xw); // lerp along row y1.
const float l = Lerp(ly0, ly1, yw);
output.Get(n, c, y, x) = l;
diff --git a/src/armnn/backends/RefWorkloads/Softmax.cpp b/src/armnn/backends/RefWorkloads/Softmax.cpp
index 58840e3076..c9f0bc5e59 100644
--- a/src/armnn/backends/RefWorkloads/Softmax.cpp
+++ b/src/armnn/backends/RefWorkloads/Softmax.cpp
@@ -11,13 +11,13 @@
namespace armnn
{
-/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo
+/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo.
void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float beta)
{
unsigned int numChannels = tensorInfo.GetShape()[1];
for (unsigned int n = 0; n < tensorInfo.GetShape()[0]; n++)
{
- // find maximum channel
+ // Find maximum channel.
float max = in[n * numChannels];
for (unsigned int c = 1; c < numChannels; c++)
{
@@ -28,7 +28,7 @@ void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float be
}
}
- // exponentiate all values and sum
+ // Exponentiate all values and sum.
std::vector<float> exponentials(numChannels);
float sum = 0.0f;
for (unsigned int c = 0; c < numChannels; c++)
@@ -38,7 +38,7 @@ void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float be
sum += exponentials[c];
}
- // divide exponentials by sum to give outputs
+ // Divide exponentials by sum to give outputs.
for (unsigned int c = 0; c < numChannels; c++)
{
out[n * numChannels + c] = exponentials[c] / sum;
diff --git a/src/armnn/backends/RefWorkloads/Softmax.hpp b/src/armnn/backends/RefWorkloads/Softmax.hpp
index c508ab2b82..f75388dc2b 100644
--- a/src/armnn/backends/RefWorkloads/Softmax.hpp
+++ b/src/armnn/backends/RefWorkloads/Softmax.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo
+/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo.
void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float beta);
} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/Splitter.hpp b/src/armnn/backends/RefWorkloads/Splitter.hpp
index bd5da6cfe2..c12d9368bf 100644
--- a/src/armnn/backends/RefWorkloads/Splitter.hpp
+++ b/src/armnn/backends/RefWorkloads/Splitter.hpp
@@ -31,7 +31,7 @@ void Splitter(const SplitterQueueDescriptor& data)
for (unsigned int i = 0; i<inputInfo0.GetNumDimensions(); i++)
{
dimensionStride /= inputInfo0.GetShape()[i];
- indices[i] = indexRemainder / dimensionStride; // use integer division to round down
+ indices[i] = indexRemainder / dimensionStride; // Use integer division to round down.
indexRemainder -= indices[i] * dimensionStride;
}
@@ -39,11 +39,11 @@ void Splitter(const SplitterQueueDescriptor& data)
{
SplitterQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx];
- //split view extents are defined by the size of (the corresponding) input tensor
+ //Split view extents are defined by the size of (the corresponding) output tensor.
const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[viewIdx]);
BOOST_ASSERT(outputInfo.GetNumDimensions() == inputInfo0.GetNumDimensions());
- // check all dimensions to see if this element is inside the given input view
+ // Check all dimensions to see if this element is inside the given input view.
bool insideView = true;
for (unsigned int i = 0; i<outputInfo.GetNumDimensions(); i++)
{
@@ -68,7 +68,7 @@ void Splitter(const SplitterQueueDescriptor& data)
dimensionStride *= outputInfo.GetShape()[i];
}
- //we are within the view, copy input data to the output corresponding to this view
+ //We are within the view, so copy input data to the output corresponding to this view.
DataType* outputData = GetOutputTensorData<DataType>(viewIdx, data);
BOOST_ASSERT(outputData);
diff --git a/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp b/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp
index 3994c1f1de..ad0f38e867 100644
--- a/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp
+++ b/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp
@@ -10,7 +10,7 @@
namespace armnn
{
-// Utility class providing access to raw tensor memory based on indices along each dimension
+// Utility class providing access to raw tensor memory based on indices along each dimension.
template <typename DataType>
class TensorBufferArrayView
{
diff --git a/src/armnn/backends/Workload.hpp b/src/armnn/backends/Workload.hpp
index dbc7574d0e..5da03bc61d 100644
--- a/src/armnn/backends/Workload.hpp
+++ b/src/armnn/backends/Workload.hpp
@@ -12,11 +12,11 @@
namespace armnn
{
-// Workload interface to enqueue a layer computation
+// Workload interface to enqueue a layer computation.
class IWorkload
{
public:
- virtual ~IWorkload(){};
+ virtual ~IWorkload() {}
virtual void Execute() const = 0;
};
@@ -46,7 +46,8 @@ protected:
const QueueDescriptor m_Data;
};
-template <typename QueueDescriptor, armnn::DataType DataType>
+// TypedWorkload asserts that all input and output tensors share a single data type, which is one of DataTypes.
+template <typename QueueDescriptor, armnn::DataType... DataTypes>
class TypedWorkload : public BaseWorkload<QueueDescriptor>
{
public:
@@ -54,27 +55,93 @@ public:
TypedWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info)
: BaseWorkload<QueueDescriptor>(descriptor, info)
{
+ std::vector<armnn::DataType> dataTypes = {DataTypes...};
+ armnn::DataType expectedInputType;
+
+ if (!info.m_InputTensorInfos.empty())
+ {
+ expectedInputType = info.m_InputTensorInfos.front().GetDataType();
+
+ if (std::find(dataTypes.begin(), dataTypes.end(), expectedInputType) == dataTypes.end())
+ {
+ BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type");
+ }
+ BOOST_ASSERT_MSG(std::all_of(std::next(info.m_InputTensorInfos.begin()),
+ info.m_InputTensorInfos.end(),
+ [&](auto it){
+ return it.GetDataType() == expectedInputType;
+ }),
+ "Trying to create workload with incorrect type");
+ }
+ armnn::DataType expectedOutputType;
+
+ if (!info.m_OutputTensorInfos.empty())
+ {
+ expectedOutputType = info.m_OutputTensorInfos.front().GetDataType();
+
+ if (!info.m_InputTensorInfos.empty())
+ {
+ if (expectedOutputType != expectedInputType)
+ {
+ BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type");
+ }
+ }
+ else if (std::find(dataTypes.begin(), dataTypes.end(), expectedOutputType) == dataTypes.end())
+ {
+ BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type");
+ }
+ BOOST_ASSERT_MSG(std::all_of(std::next(info.m_OutputTensorInfos.begin()),
+ info.m_OutputTensorInfos.end(),
+ [&](auto it){
+ return it.GetDataType() == expectedOutputType;
+ }),
+ "Trying to create workload with incorrect type");
+ }
+ }
+};
+
+template <typename QueueDescriptor, armnn::DataType InputDataType, armnn::DataType OutputDataType>
+class MultiTypedWorkload : public BaseWorkload<QueueDescriptor>
+{
+public:
+
+ MultiTypedWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info)
+ : BaseWorkload<QueueDescriptor>(descriptor, info)
+ {
BOOST_ASSERT_MSG(std::all_of(info.m_InputTensorInfos.begin(),
info.m_InputTensorInfos.end(),
[&](auto it){
- return it.GetDataType() == DataType;
+ return it.GetDataType() == InputDataType;
}),
"Trying to create workload with incorrect type");
BOOST_ASSERT_MSG(std::all_of(info.m_OutputTensorInfos.begin(),
info.m_OutputTensorInfos.end(),
[&](auto it){
- return it.GetDataType() == DataType;
+ return it.GetDataType() == OutputDataType;
}),
"Trying to create workload with incorrect type");
}
-
- static constexpr armnn::DataType ms_DataType = DataType;
};
template <typename QueueDescriptor>
+using FloatWorkload = TypedWorkload<QueueDescriptor,
+ armnn::DataType::Float16,
+ armnn::DataType::Float32>;
+
+template <typename QueueDescriptor>
using Float32Workload = TypedWorkload<QueueDescriptor, armnn::DataType::Float32>;
template <typename QueueDescriptor>
using Uint8Workload = TypedWorkload<QueueDescriptor, armnn::DataType::QuantisedAsymm8>;
+template <typename QueueDescriptor>
+using Float16ToFloat32Workload = MultiTypedWorkload<QueueDescriptor,
+ armnn::DataType::Float16,
+ armnn::DataType::Float32>;
+
+template <typename QueueDescriptor>
+using Float32ToFloat16Workload = MultiTypedWorkload<QueueDescriptor,
+ armnn::DataType::Float32,
+ armnn::DataType::Float16>;
+
} //namespace armnn
diff --git a/src/armnn/backends/WorkloadData.cpp b/src/armnn/backends/WorkloadData.cpp
index c951fc5d8d..aa763801ce 100644
--- a/src/armnn/backends/WorkloadData.cpp
+++ b/src/armnn/backends/WorkloadData.cpp
@@ -22,6 +22,8 @@ DataType GetBiasDataType(DataType inputDataType)
{
switch (inputDataType)
{
+ case DataType::Float16:
+ return DataType::Float16;
case DataType::Float32:
return DataType::Float32;
case DataType::QuantisedAsymm8:
@@ -148,7 +150,7 @@ void ValidateBiasTensorQuantization(const TensorInfo& biasTensor, const TensorIn
to_string(biasTensor.GetQuantizationOffset()));
}
const float expectedScale = inputTensorInfo.GetQuantizationScale() * weightsTensorInfo.GetQuantizationScale();
- if (biasTensor.GetQuantizationScale() != expectedScale)
+ if (std::abs(biasTensor.GetQuantizationScale() - expectedScale) > 0.000000001f)
{
// Print the float values with extra precision to see very small differences
std::stringstream msg;
@@ -338,11 +340,11 @@ void SplitterQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
". Number of workloadInfo.m_OutputTensorInfos: " + to_string(workloadInfo.m_OutputTensorInfos.size()));
}
- //the dimensionality of all the windows has to match the dimensionality (not shape) of the input
+ //The dimensionality of all the windows has to match the dimensionality (not shape) of the input.
std::size_t inputDims = workloadInfo.m_InputTensorInfos[0].GetNumDimensions();
for(unsigned int w = 0; w < m_ViewOrigins.size(); ++w )
{
- //check that the dimensionality of input is same as the split windows
+ //Checks that the dimensionality of input is same as the split windows.
ViewOrigin const& e = m_ViewOrigins[w];
if (e.m_Origin.size() != inputDims)
{
@@ -399,11 +401,11 @@ void MergerQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
". Number of workloadInfo.m_InputTensorInfos: " + to_string(workloadInfo.m_InputTensorInfos.size()));
}
- //the dimensionality of all the windows has to match the dimensionality (not shape) of the output
+ //The dimensionality of all the windows has to match the dimensionality (not shape) of the output.
std::size_t outputDims = workloadInfo.m_OutputTensorInfos[0].GetNumDimensions();
for(unsigned int w = 0; w < m_ViewOrigins.size(); ++w )
{
- //check that the dimensionality of output is same as the split windows
+ //Checks that the dimensionality of output is same as the merge windows.
ViewOrigin const& e = m_ViewOrigins[w];
if (e.m_Origin.size() != outputDims)
{
@@ -415,7 +417,7 @@ void MergerQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
"tensor has " +
to_string(outputDims) + " dimensions.");
}
- //check that the merge windows are within the output tensor
+ //Checks that the merge windows are within the output tensor.
for (unsigned int i = 0; i < e.m_Origin.size(); ++i)
{
if (e.m_Origin[i] + workloadInfo.m_InputTensorInfos[w].GetShape()[i]
@@ -456,7 +458,7 @@ void FullyConnectedQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c
"bias value tensor descriptor is missing.");
}
- // validate type and quantization values
+ // Validates type and quantization values.
ValidateBiasTensorQuantization(m_Bias->GetTensorInfo(),
workloadInfo.m_InputTensorInfos[0], m_Weight->GetTensorInfo(), "FullyConnectedQueueDescriptor");
@@ -578,7 +580,7 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
ValidatePointer(m_Weight, "DepthwiseConvolution2dQueueDescriptor", "weight");
ValidateTensorNumDimensions(m_Weight->GetTensorInfo(), "DepthwiseConvolution2dQueueDescriptor", 4, "weight");
- //inputChannels * channelMultiplier should be equal to outputChannels
+ //inputChannels * channelMultiplier should be equal to outputChannels.
const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0];
const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1];
const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[1];
@@ -649,7 +651,7 @@ void ResizeBilinearQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c
ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "ResizeBilinearQueueDescriptor", 4, "input");
ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "ResizeBilinearQueueDescriptor", 4, "output");
- // Resize bilinear only changes width and height: batch and channel count must match
+ // Resize bilinear only changes width and height: batch and channel count must match.
{
const unsigned int inputBatchSize = workloadInfo.m_InputTensorInfos[0].GetShape()[0];
const unsigned int outputBatchSize = workloadInfo.m_OutputTensorInfos[0].GetShape()[0];
@@ -747,4 +749,53 @@ void FloorQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
}
}
+void LstmQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+ ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "LstmQueueDescriptor", 2, "input");
+ ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "LstmQueueDescriptor", 2, "output");
+}
+
+void ConvertFp32ToFp16QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+ ValidateSingleInput(workloadInfo, "ConvertFp32ToFp16QueueDescriptor");
+ ValidateSingleOutput(workloadInfo, "ConvertFp32ToFp16QueueDescriptor");
+
+ if (workloadInfo.m_InputTensorInfos[0].GetDataType() != DataType::Float32)
+ {
+ throw InvalidArgumentException("ConvertFp32ToFp16QueueDescriptor: Input tensor type must be Float32.");
+ }
+
+ if (workloadInfo.m_OutputTensorInfos[0].GetDataType() != DataType::Float16)
+ {
+ throw InvalidArgumentException("ConvertFp32ToFp16QueueDescriptor: Output tensor type must be Float16.");
+ }
+
+ ValidateTensorShapesMatch(workloadInfo.m_InputTensorInfos[0],
+ workloadInfo.m_OutputTensorInfos[0],
+ "ConvertFp32ToFp16QueueDescriptor",
+ "input",
+ "output");
+}
+
+void ConvertFp16ToFp32QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+ ValidateSingleInput(workloadInfo, "ConvertFp16ToFp32QueueDescriptor");
+ ValidateSingleOutput(workloadInfo, "ConvertFp16ToFp32QueueDescriptor");
+
+ if (workloadInfo.m_InputTensorInfos[0].GetDataType() != DataType::Float16)
+ {
+ throw InvalidArgumentException("ConvertFp16ToFp32QueueDescriptor: Input tensor type must be Float16.");
+ }
+ if (workloadInfo.m_OutputTensorInfos[0].GetDataType() != DataType::Float32)
+ {
+ throw InvalidArgumentException("ConvertFp16ToFp32QueueDescriptor: Output tensor type must be Float32.");
+ }
+
+ ValidateTensorShapesMatch(workloadInfo.m_InputTensorInfos[0],
+ workloadInfo.m_OutputTensorInfos[0],
+ "ConvertFp16ToFp32QueueDescriptor",
+ "input",
+ "output");
+}
+
} //namespace armnn
diff --git a/src/armnn/backends/WorkloadData.hpp b/src/armnn/backends/WorkloadData.hpp
index 7f8713582f..db266e6df8 100644
--- a/src/armnn/backends/WorkloadData.hpp
+++ b/src/armnn/backends/WorkloadData.hpp
@@ -17,7 +17,7 @@
namespace armnn
{
-//a helper function that returns the bias data type required for given input data type.
+//A helper function that returns the bias data type required for given input data type.
DataType GetBiasDataType(DataType inputDataType);
struct WorkloadInfo;
@@ -38,7 +38,7 @@ protected:
QueueDescriptor& operator=(QueueDescriptor const&) = default;
};
-// Base class for queue descriptors which contain parameters
+// Base class for queue descriptors which contain parameters.
template <typename LayerDescriptor>
struct QueueDescriptorWithParameters : public QueueDescriptor
{
@@ -59,13 +59,13 @@ struct MemCopyQueueDescriptor : QueueDescriptor
using InputQueueDescriptor = MemCopyQueueDescriptor;
using OutputQueueDescriptor = MemCopyQueueDescriptor;
-// Softmax layer workload data
+// Softmax layer workload data.
struct SoftmaxQueueDescriptor : QueueDescriptorWithParameters<SoftmaxDescriptor>
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Splitter layer workload data
+// Splitter layer workload data.
struct SplitterQueueDescriptor : QueueDescriptorWithParameters<ViewsDescriptor>
{
struct ViewOrigin
@@ -73,18 +73,18 @@ struct SplitterQueueDescriptor : QueueDescriptorWithParameters<ViewsDescriptor>
ViewOrigin() {}
ViewOrigin(std::vector<unsigned int> const& origin) : m_Origin(origin) {}
- //view origin (size of the vector is the same as number of dimensions of the view)
+ //View origin (size of the vector is the same as number of dimensions of the view).
std::vector<unsigned int> m_Origin;
};
- //view defines a tensor that will be carved from the input tensor.
- //view origins are stored here, the extents are defined by sizes of the output tensors.
+ //View defines a tensor that will be carved from the input tensor.
+ //View origins are stored here, the extents are defined by sizes of the output tensors.
std::vector<ViewOrigin> m_ViewOrigins;
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Merger layer workload data
+// Merger layer workload data.
struct MergerQueueDescriptor : QueueDescriptorWithParameters<OriginsDescriptor>
{
struct ViewOrigin
@@ -92,24 +92,24 @@ struct MergerQueueDescriptor : QueueDescriptorWithParameters<OriginsDescriptor>
ViewOrigin() {}
ViewOrigin(const std::vector<unsigned int>& origin) : m_Origin(origin) {}
- //view origin (size of the vector is the same as number of dimensions of the view)
+ //View origin (size of the vector is the same as number of dimensions of the view).
std::vector<unsigned int> m_Origin;
};
- //view defines a sub-area of the output tensor that will be filled with the corresponding input tensor.
- //view origins are stored here, the extents are defined by sizes of the input tensors.
+ //View defines a sub-area of the output tensor that will be filled with the corresponding input tensor.
+ //View origins are stored here, the extents are defined by sizes of the input tensors.
std::vector<ViewOrigin> m_ViewOrigins;
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Activation layer workload data
+// Activation layer workload data.
struct ActivationQueueDescriptor : QueueDescriptorWithParameters<ActivationDescriptor>
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Fully connected layer workload data
+// Fully connected layer workload data.
struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters<FullyConnectedDescriptor>
{
FullyConnectedQueueDescriptor()
@@ -124,19 +124,19 @@ struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters<FullyConnec
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Permute layer workload data
+// Permute layer workload data.
struct PermuteQueueDescriptor : QueueDescriptorWithParameters<PermuteDescriptor>
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Pooling 2D layer workload data
+// Pooling 2D layer workload data.
struct Pooling2dQueueDescriptor : QueueDescriptorWithParameters<Pooling2dDescriptor>
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Convolution 2D layer workload data
+// Convolution 2D layer workload data.
struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters<Convolution2dDescriptor>
{
Convolution2dQueueDescriptor()
@@ -151,7 +151,7 @@ struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters<Convolution2
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Depthwise Convolution 2D layer workload data
+// Depthwise Convolution 2D layer workload data.
struct DepthwiseConvolution2dQueueDescriptor : QueueDescriptorWithParameters<DepthwiseConvolution2dDescriptor>
{
DepthwiseConvolution2dQueueDescriptor()
@@ -166,25 +166,25 @@ struct DepthwiseConvolution2dQueueDescriptor : QueueDescriptorWithParameters<Dep
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Normalization layer workload data
+// Normalization layer workload data.
struct NormalizationQueueDescriptor : QueueDescriptorWithParameters<NormalizationDescriptor>
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Add layer workload data
+// Add layer workload data.
struct AdditionQueueDescriptor : QueueDescriptor
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Multiplication layer workload data
+// Multiplication layer workload data.
struct MultiplicationQueueDescriptor : QueueDescriptor
{
void Validate(const WorkloadInfo& workloadInfo) const;
};
-// Batch norm layer workload data
+// Batch norm layer workload data.
struct BatchNormalizationQueueDescriptor : QueueDescriptorWithParameters<BatchNormalizationDescriptor>
{
BatchNormalizationQueueDescriptor()
@@ -249,4 +249,58 @@ struct FloorQueueDescriptor : QueueDescriptor
void Validate(const WorkloadInfo& workloadInfo) const;
};
+struct LstmQueueDescriptor : QueueDescriptorWithParameters<LstmDescriptor>
+{
+ LstmQueueDescriptor()
+ : m_InputToInputWeights(nullptr)
+ , m_InputToForgetWeights(nullptr)
+ , m_InputToCellWeights(nullptr)
+ , m_InputToOutputWeights(nullptr)
+ , m_RecurrentToInputWeights(nullptr)
+ , m_RecurrentToForgetWeights(nullptr)
+ , m_RecurrentToCellWeights(nullptr)
+ , m_RecurrentToOutputWeights(nullptr)
+ , m_CellToInputWeights(nullptr)
+ , m_CellToForgetWeights(nullptr)
+ , m_CellToOutputWeights(nullptr)
+ , m_InputGateBias(nullptr)
+ , m_ForgetGateBias(nullptr)
+ , m_CellBias(nullptr)
+ , m_OutputGateBias(nullptr)
+ , m_ProjectionWeights(nullptr)
+ , m_ProjectionBias(nullptr)
+ {
+ }
+
+ const ConstCpuTensorHandle* m_InputToInputWeights;
+ const ConstCpuTensorHandle* m_InputToForgetWeights;
+ const ConstCpuTensorHandle* m_InputToCellWeights;
+ const ConstCpuTensorHandle* m_InputToOutputWeights;
+ const ConstCpuTensorHandle* m_RecurrentToInputWeights;
+ const ConstCpuTensorHandle* m_RecurrentToForgetWeights;
+ const ConstCpuTensorHandle* m_RecurrentToCellWeights;
+ const ConstCpuTensorHandle* m_RecurrentToOutputWeights;
+ const ConstCpuTensorHandle* m_CellToInputWeights;
+ const ConstCpuTensorHandle* m_CellToForgetWeights;
+ const ConstCpuTensorHandle* m_CellToOutputWeights;
+ const ConstCpuTensorHandle* m_InputGateBias;
+ const ConstCpuTensorHandle* m_ForgetGateBias;
+ const ConstCpuTensorHandle* m_CellBias;
+ const ConstCpuTensorHandle* m_OutputGateBias;
+ const ConstCpuTensorHandle* m_ProjectionWeights;
+ const ConstCpuTensorHandle* m_ProjectionBias;
+
+ void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
+struct ConvertFp16ToFp32QueueDescriptor : QueueDescriptor
+{
+ void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
+struct ConvertFp32ToFp16QueueDescriptor : QueueDescriptor
+{
+ void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
} //namespace armnn
diff --git a/src/armnn/backends/WorkloadFactory.cpp b/src/armnn/backends/WorkloadFactory.cpp
index 4e94d7701c..1b3f29421a 100644
--- a/src/armnn/backends/WorkloadFactory.cpp
+++ b/src/armnn/backends/WorkloadFactory.cpp
@@ -20,7 +20,40 @@
namespace armnn
{
-bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, DataType dataType,
+namespace
+{
+    /// Returns a TensorInfo equal to @p info but with its data type replaced by @p type.
+    ///
+    /// When @p type is empty the input is returned unchanged. Otherwise a new TensorInfo is built from
+    /// the shape, quantization scale and quantization offset of @p info together with the requested type.
+    /// The result is returned as a non-const value so that callers assigning it to an existing TensorInfo
+    /// are not forced into a copy by a const-qualified temporary.
+    TensorInfo OverrideDataType(const TensorInfo& info, boost::optional<DataType> type)
+    {
+        if (!type)
+        {
+            return info;
+        }
+
+        return TensorInfo(info.GetShape(), type.get(), info.GetQuantizationScale(), info.GetQuantizationOffset());
+    }
+
+    /// Maps a (possibly absent) weights data type to the data type used for the matching bias tensor.
+    ///
+    /// - empty optional          -> empty optional (no override requested)
+    /// - Float16 / Float32       -> the same type as the weights
+    /// - QuantisedAsymm8         -> Signed32
+    /// - anything else           -> asserts, then yields an empty optional
+    boost::optional<DataType> GetBiasTypeFromWeightsType(boost::optional<DataType> weightsType)
+    {
+        if (!weightsType)
+        {
+            return weightsType;
+        }
+
+        switch(weightsType.get())
+        {
+            case DataType::Float16:
+            case DataType::Float32:
+                return weightsType;
+            case DataType::QuantisedAsymm8:
+                return DataType::Signed32;
+            default:
+                BOOST_ASSERT_MSG(false, "GetBiasTypeFromWeightsType(): Unsupported data type.");
+        }
+        // Only reached for an unsupported type when assertions are compiled out.
+        return boost::none;
+    }
+}
+
+bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, boost::optional<DataType> dataType,
std::string& outReasonIfUnsupported)
{
constexpr size_t reasonCapacity = 1024;
@@ -32,7 +65,13 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
{
auto cLayer = boost::polymorphic_downcast<const ActivationLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsActivationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity);
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ result = IsActivationSupported(compute,
+ OverrideDataType(input, dataType),
+ OverrideDataType(output, dataType),
+ cLayer->GetParameters(),
+ reason,
+ reasonCapacity);
break;
}
case LayerType::Addition:
@@ -40,30 +79,64 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo();
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsAdditionSupported(compute, input0, input1, output, reason, reasonCapacity);
+ result = IsAdditionSupported(compute,
+ OverrideDataType(input0, dataType),
+ OverrideDataType(input1, dataType),
+ OverrideDataType(output, dataType),
+ reason,
+ reasonCapacity);
break;
}
case LayerType::BatchNormalization:
{
auto cLayer = boost::polymorphic_downcast<const BatchNormalizationLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsBatchNormalizationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity);
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ const TensorInfo& mean = cLayer->m_Mean->GetTensorInfo();
+ const TensorInfo& var = cLayer->m_Variance->GetTensorInfo();
+ const TensorInfo& beta = cLayer->m_Beta->GetTensorInfo();
+ const TensorInfo& gamma = cLayer->m_Gamma->GetTensorInfo();
+ result = IsBatchNormalizationSupported(compute,
+ OverrideDataType(input, dataType),
+ OverrideDataType(output, dataType),
+ OverrideDataType(mean, dataType),
+ OverrideDataType(var, dataType),
+ OverrideDataType(beta, dataType),
+ OverrideDataType(gamma, dataType),
+ cLayer->GetParameters(),
+ reason, reasonCapacity);
break;
}
case LayerType::Constant:
{
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsConstantSupported(compute, output, reason, reasonCapacity);
+ result = IsConstantSupported(compute, OverrideDataType(output, dataType), reason, reasonCapacity);
break;
}
- case LayerType::Convolution2d:
+ case LayerType::ConvertFp16ToFp32:
{
- auto cLayer = boost::polymorphic_downcast<const Convolution2dLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ result = IsConvertFp16ToFp32Supported(compute, input, output, reason, reasonCapacity);
+ break;
+ }
+ case LayerType::ConvertFp32ToFp16:
+ {
+ const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ result = IsConvertFp32ToFp16Supported(compute, input, output, reason, reasonCapacity);
+ break;
+ }
+ case LayerType::Convolution2d:
+ {
+ auto cLayer = boost::polymorphic_downcast<const Convolution2dLayer*>(&layer);
+ const TensorInfo input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), dataType);
+ const TensorInfo output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType);
BOOST_ASSERT(cLayer->m_Weight.get() != nullptr);
- const TensorInfo * biasInfo = nullptr;
+ TensorInfo biasInfo;
+ const TensorInfo * biasInfoPtr = nullptr;
+ static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16);
static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32);
static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32);
@@ -72,21 +145,27 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
if (descriptor.m_BiasEnabled)
{
BOOST_ASSERT(cLayer->m_Bias.get() != nullptr);
- biasInfo = &(cLayer->m_Bias->GetTensorInfo());
+ biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType));
+ biasInfoPtr = &biasInfo;
}
else
{
- // If biases are not enabled I pass a dummy tensorinfo for the validation
+ // If biases are not enabled pass a dummy tensorinfo for the validation.
switch(input.GetDataType())
{
+ case DataType::Float16:
+ {
+ biasInfoPtr = &dummyFloat16Bias;
+ break;
+ }
case DataType::Float32:
{
- biasInfo = &dummyFloat32Bias;
+ biasInfoPtr = &dummyFloat32Bias;
break;
}
case DataType::QuantisedAsymm8:
{
- biasInfo = &dummyQA8Bias;
+ biasInfoPtr = &dummyQA8Bias;
break;
}
default:
@@ -100,16 +179,16 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
input,
output,
descriptor,
- cLayer->m_Weight->GetTensorInfo(),
- *biasInfo,
+ OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType),
+ *biasInfoPtr,
reason,
reasonCapacity);
break;
}
case LayerType::MemCopy:
{
- // MemCopy supported for CpuRef, CpuAcc and GpuAcc backends
- // (also treat Undefined as CpuRef to avoid breaking lots of Unit tests)
+ // MemCopy supported for CpuRef, CpuAcc and GpuAcc backends,
+ // (also treat Undefined as CpuRef to avoid breaking lots of Unit tests).
result = compute == Compute::CpuRef || compute == Compute::Undefined
|| compute == Compute::CpuAcc || compute == Compute::GpuAcc;
strcpy(reason, "Unsupported backend type");
@@ -118,66 +197,314 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
case LayerType::DepthwiseConvolution2d:
{
auto cLayer = boost::polymorphic_downcast<const DepthwiseConvolution2dLayer*>(&layer);
- const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsDepthwiseConvolutionSupported(compute, input, cLayer->GetParameters(),
- cLayer->m_Weight->GetTensorInfo(), reason, reasonCapacity);
+ const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(),
+ dataType);
+ const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType);
+ BOOST_ASSERT(cLayer->m_Weight.get() != nullptr);
+
+ TensorInfo biasInfo;
+ const TensorInfo * biasInfoPtr = nullptr;
+ static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16);
+ static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32);
+ static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32);
+
+ const DepthwiseConvolution2dDescriptor& descriptor = cLayer->GetParameters();
+ if (descriptor.m_BiasEnabled)
+ {
+ BOOST_ASSERT(cLayer->m_Bias.get() != nullptr);
+ biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType));
+ biasInfoPtr = &biasInfo;
+ }
+ else
+ {
+ // If biases are not enabled pass a dummy tensorinfo for the validation
+ switch(input.GetDataType())
+ {
+ case DataType::Float16:
+ {
+ biasInfoPtr = &dummyFloat16Bias;
+ break;
+ }
+ case DataType::Float32:
+ {
+ biasInfoPtr = &dummyFloat32Bias;
+ break;
+ }
+ case DataType::QuantisedAsymm8:
+ {
+ biasInfoPtr = &dummyQA8Bias;
+ break;
+ }
+ default:
+ {
+ BOOST_ASSERT_MSG(false, "Unexpected bias type");
+ }
+ }
+ }
+
+
+ result = IsDepthwiseConvolutionSupported(compute,
+ input,
+ output,
+ descriptor,
+ OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType),
+ *biasInfoPtr,
+ reason,
+ reasonCapacity);
break;
}
case LayerType::FakeQuantization:
{
auto cLayer = boost::polymorphic_downcast<const FakeQuantizationLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsFakeQuantizationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity);
+ result = IsFakeQuantizationSupported(compute, OverrideDataType(input, dataType), cLayer->GetParameters(),
+ reason, reasonCapacity);
break;
}
case LayerType::Floor:
{
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsFloorSupported(compute, input, output, reason, reasonCapacity);
+ result = IsFloorSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType),
+ reason, reasonCapacity);
break;
}
case LayerType::FullyConnected:
{
auto cLayer = boost::polymorphic_downcast<const FullyConnectedLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsFullyConnectedSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity);
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ BOOST_ASSERT(cLayer->m_Weight.get() != nullptr);
+
+ TensorInfo biasInfo;
+ const TensorInfo * biasInfoPtr = nullptr;
+ static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16);
+ static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32);
+ static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32);
+
+ const FullyConnectedDescriptor& descriptor = cLayer->GetParameters();
+ if (descriptor.m_BiasEnabled)
+ {
+ BOOST_ASSERT(cLayer->m_Bias.get() != nullptr);
+ biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType));
+ biasInfoPtr = &biasInfo;
+ }
+ else
+ {
+ // If biases are not enabled pass a dummy tensorinfo for the validation
+ switch(input.GetDataType())
+ {
+ case DataType::Float16:
+ {
+ biasInfoPtr = &dummyFloat16Bias;
+ break;
+ }
+ case DataType::Float32:
+ {
+ biasInfoPtr = &dummyFloat32Bias;
+ break;
+ }
+ case DataType::QuantisedAsymm8:
+ {
+ biasInfoPtr = &dummyQA8Bias;
+ break;
+ }
+ default:
+ {
+ BOOST_ASSERT_MSG(false, "Unexpected bias type");
+ }
+ }
+ }
+
+ result = IsFullyConnectedSupported(compute,
+ OverrideDataType(input, dataType),
+ OverrideDataType(output, dataType),
+ OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType),
+ *biasInfoPtr,
+ descriptor,
+ reason,
+ reasonCapacity);
break;
}
case LayerType::Input:
{
const TensorInfo& input = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsInputSupported(compute, input, reason, reasonCapacity);
+ result = IsInputSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity);
break;
}
case LayerType::L2Normalization:
{
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsL2NormalizationSupported(compute, input, reason, reasonCapacity);
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ result = IsL2NormalizationSupported(compute, OverrideDataType(input, dataType),
+ OverrideDataType(output, dataType), reason, reasonCapacity);
+ break;
+ }
+ case LayerType::Lstm:
+ {
+ auto cLayer = boost::polymorphic_downcast<const LstmLayer*>(&layer);
+ const LstmDescriptor& descriptor = cLayer->GetParameters();
+
+ // All inputs.
+ const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(),
+ dataType);
+ const TensorInfo& outputStateIn = OverrideDataType(layer.GetInputSlot(1).GetConnection()->GetTensorInfo(),
+ dataType);
+ const TensorInfo& cellStateIn = OverrideDataType(layer.GetInputSlot(2).GetConnection()->GetTensorInfo(),
+ dataType);
+ // All outputs
+ const TensorInfo& scratchBuffer = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType);
+ const TensorInfo& outputStateOut = OverrideDataType(layer.GetOutputSlot(1).GetTensorInfo(), dataType);
+ const TensorInfo& cellStateOut = OverrideDataType(layer.GetOutputSlot(2).GetTensorInfo(), dataType);
+ const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(3).GetTensorInfo(), dataType);
+
+ // Basic parameters
+ const TensorInfo& inputToForgetWeights
+ = OverrideDataType(cLayer->m_BasicParameters.m_InputToForgetWeights->GetTensorInfo(), dataType);
+ const TensorInfo& inputToCellWeights
+ = OverrideDataType(cLayer->m_BasicParameters.m_InputToCellWeights->GetTensorInfo(), dataType);
+ const TensorInfo& inputToOutputWeights
+ = OverrideDataType(cLayer->m_BasicParameters.m_InputToOutputWeights->GetTensorInfo(), dataType);
+ const TensorInfo& recurrentToForgetWeights
+ = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToForgetWeights->GetTensorInfo(), dataType);
+ const TensorInfo& recurrentToCellWeights
+ = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToCellWeights->GetTensorInfo(), dataType);
+ const TensorInfo& recurrentToOutputWeights
+ = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToOutputWeights->GetTensorInfo(), dataType);
+ const TensorInfo& forgetGateBias
+ = OverrideDataType(cLayer->m_BasicParameters.m_ForgetGateBias->GetTensorInfo(), dataType);
+ const TensorInfo& cellBias
+ = OverrideDataType(cLayer->m_BasicParameters.m_CellBias->GetTensorInfo(), dataType);
+ const TensorInfo& outputGateBias
+ = OverrideDataType(cLayer->m_BasicParameters.m_OutputGateBias->GetTensorInfo(), dataType);
+
+ // Optional parameters
+ const TensorInfo* inputToInputWeights = nullptr;
+ const TensorInfo* recurrentToInputWeights = nullptr;
+ const TensorInfo* cellToInputWeights = nullptr;
+ const TensorInfo* inputGateBias = nullptr;
+ const TensorInfo* projectionWeights = nullptr;
+ const TensorInfo* projectionBias = nullptr;
+ const TensorInfo* cellToForgetWeights = nullptr;
+ const TensorInfo* cellToOutputWeights = nullptr;
+
+ TensorInfo optInputToInputWeights;
+ TensorInfo optRecurrentToInputWeights;
+ TensorInfo optCellToInputWeights;
+ TensorInfo optInputGateBias;
+ TensorInfo optProjectionWeights;
+ TensorInfo optProjectionBias;
+ TensorInfo optCellToForgetWeights;
+ TensorInfo optCellToOutputWeights;
+
+ if(!descriptor.m_CifgEnabled)
+ {
+ optInputToInputWeights =
+ OverrideDataType(cLayer->m_CifgParameters.m_InputToInputWeights->GetTensorInfo(), dataType);
+ inputToInputWeights = &optInputToInputWeights;
+
+ optRecurrentToInputWeights =
+ OverrideDataType(cLayer->m_CifgParameters.m_RecurrentToInputWeights->GetTensorInfo(), dataType);
+ recurrentToInputWeights = &optRecurrentToInputWeights;
+ if (cLayer->m_CifgParameters.m_CellToInputWeights != nullptr)
+ {
+ optCellToInputWeights =
+ OverrideDataType(cLayer->m_CifgParameters.m_CellToInputWeights->GetTensorInfo(), dataType);
+ cellToInputWeights = &optCellToInputWeights;
+ }
+ optInputGateBias =
+ OverrideDataType(cLayer->m_CifgParameters.m_InputGateBias->GetTensorInfo(), dataType);
+ inputGateBias = &optInputGateBias;
+ }
+
+ if(descriptor.m_ProjectionEnabled)
+ {
+ optProjectionWeights =
+ OverrideDataType(cLayer->m_ProjectionParameters.m_ProjectionWeights->GetTensorInfo(), dataType);
+ projectionWeights = &optProjectionWeights;
+ if (cLayer->m_ProjectionParameters.m_ProjectionBias != nullptr)
+ {
+ optProjectionBias =
+ OverrideDataType(cLayer->m_ProjectionParameters.m_ProjectionBias->GetTensorInfo(), dataType);
+ projectionBias = &optProjectionBias;
+ }
+ }
+
+ if(descriptor.m_PeepholeEnabled)
+ {
+ optCellToForgetWeights =
+ OverrideDataType(cLayer->m_PeepholeParameters.m_CellToForgetWeights->GetTensorInfo(), dataType);
+ cellToForgetWeights = &optCellToForgetWeights;
+ optCellToOutputWeights =
+ OverrideDataType(cLayer->m_PeepholeParameters.m_CellToOutputWeights->GetTensorInfo(), dataType);
+ cellToOutputWeights = &optCellToOutputWeights;
+ }
+
+ result = IsLstmSupported(compute,
+ input,
+ outputStateIn,
+ cellStateIn,
+ scratchBuffer,
+ outputStateOut,
+ cellStateOut,
+ output,
+ descriptor,
+ inputToForgetWeights,
+ inputToCellWeights,
+ inputToOutputWeights,
+ recurrentToForgetWeights,
+ recurrentToCellWeights,
+ recurrentToOutputWeights,
+ forgetGateBias,
+ cellBias,
+ outputGateBias,
+ inputToInputWeights,
+ recurrentToInputWeights,
+ cellToInputWeights,
+ inputGateBias,
+ projectionWeights,
+ projectionBias,
+ cellToForgetWeights,
+ cellToOutputWeights,
+ reason,
+ reasonCapacity);
break;
}
case LayerType::Merger:
{
auto cLayer = boost::polymorphic_downcast<const MergerLayer*>(&layer);
- // Get vector of all inputs
- auto getTensorInfo = [](const InputSlot& slot)
+ // Get vector of all inputs.
+ auto getTensorInfo = [&dataType](const InputSlot& slot)
{
- return &slot.GetConnectedOutputSlot()->GetTensorInfo();
+ return OverrideDataType(slot.GetConnectedOutputSlot()->GetTensorInfo(), dataType);
};
- auto begin = boost::make_transform_iterator(layer.GetInputSlots().begin(), getTensorInfo);
- auto end = boost::make_transform_iterator(layer.GetInputSlots().end(), getTensorInfo);
+ auto beginI = boost::make_transform_iterator(layer.GetInputSlots().begin(), getTensorInfo);
+ auto endI = boost::make_transform_iterator(layer.GetInputSlots().end(), getTensorInfo);
+ std::vector<TensorInfo> inputs(beginI, endI);
- std::vector<const TensorInfo*> inputs(begin, end);
+ auto getTensorInfoPtr = [](const TensorInfo& info)
+ {
+ return &info;
+ };
+ auto beginPtr = boost::make_transform_iterator(inputs.begin(), getTensorInfoPtr);
+ auto endPtr = boost::make_transform_iterator(inputs.end(), getTensorInfoPtr);
+ std::vector<const TensorInfo*> inputPtrs(beginPtr, endPtr);
- result = IsMergerSupported(compute, inputs, cLayer->GetParameters(), reason, reasonCapacity);
+ result = IsMergerSupported(compute, inputPtrs, cLayer->GetParameters(), reason, reasonCapacity);
break;
}
case LayerType::Multiplication:
{
const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo();
- result = IsMultiplicationSupported(compute, input0, input1, reason, reasonCapacity);
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ result = IsMultiplicationSupported(compute,
+ OverrideDataType(input0, dataType),
+ OverrideDataType(input1, dataType),
+ OverrideDataType(output, dataType),
+ reason,
+ reasonCapacity);
break;
}
case LayerType::Normalization:
@@ -185,13 +512,15 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
auto cLayer = boost::polymorphic_downcast<const NormalizationLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsNormalizationSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity);
+ result = IsNormalizationSupported(compute, OverrideDataType(input, dataType),
+ OverrideDataType(output, dataType), cLayer->GetParameters(), reason,
+ reasonCapacity);
break;
}
case LayerType::Output:
{
const TensorInfo& output = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsOutputSupported(compute, output, reason, reasonCapacity);
+ result = IsOutputSupported(compute, OverrideDataType(output, dataType), reason, reasonCapacity);
break;
}
case LayerType::Permute:
@@ -199,7 +528,8 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
auto cLayer = boost::polymorphic_downcast<const PermuteLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsPermuteSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity);
+ result = IsPermuteSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType),
+ cLayer->GetParameters(), reason, reasonCapacity);
break;
}
case LayerType::Pooling2d:
@@ -207,33 +537,38 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
auto cLayer = boost::polymorphic_downcast<const Pooling2dLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
- result = IsPooling2dSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity);
+ result = IsPooling2dSupported(compute, OverrideDataType(input, dataType),
+ OverrideDataType(output, dataType), cLayer->GetParameters(), reason,
+ reasonCapacity);
break;
}
case LayerType::Reshape:
{
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsReshapeSupported(compute, input, reason, reasonCapacity);
+ result = IsReshapeSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity);
break;
}
case LayerType::ResizeBilinear:
{
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsResizeBilinearSupported(compute, input, reason, reasonCapacity);
+ result = IsResizeBilinearSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity);
break;
}
case LayerType::Softmax:
{
auto cLayer = boost::polymorphic_downcast<const SoftmaxLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsSoftmaxSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity);
+ const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+ result = IsSoftmaxSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType),
+ cLayer->GetParameters(), reason, reasonCapacity);
break;
}
case LayerType::Splitter:
{
auto cLayer = boost::polymorphic_downcast<const SplitterLayer*>(&layer);
const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
- result = IsSplitterSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity);
+ result = IsSplitterSupported(compute, OverrideDataType(input, dataType), cLayer->GetParameters(), reason,
+ reasonCapacity);
break;
}
default:
@@ -248,7 +583,8 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat
return result;
}
-bool IWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported)
+// Convenience overload: uses the compute device reported by layer.GetComputeDevice() and forwards
+// to the IsLayerSupported overload that takes an explicit Compute argument. dataType and
+// outReasonIfUnsupported are passed through unchanged.
+bool IWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
{
return IsLayerSupported(layer.GetComputeDevice(), layer, dataType, outReasonIfUnsupported);
}
diff --git a/src/armnn/backends/WorkloadFactory.hpp b/src/armnn/backends/WorkloadFactory.hpp
index 5791c1b46f..c211a290b3 100644
--- a/src/armnn/backends/WorkloadFactory.hpp
+++ b/src/armnn/backends/WorkloadFactory.hpp
@@ -8,13 +8,14 @@
#include <memory>
#include "armnn/TensorFwd.hpp"
#include "OutputHandler.hpp"
+#include <boost/optional.hpp>
namespace armnn
{
class Layer;
-// Workload factory interface for compute backends
+// Workload factory interface for compute backends.
class IWorkloadFactory
{
public:
@@ -25,9 +26,16 @@ public:
/// Informs the memory manager that the network is finalized and ready for execution.
virtual void Finalize() { }
- static bool IsLayerSupported(Compute compute, const Layer& layer, DataType dataType,
+ /// Inform the memory manager to release the memory
+ virtual void Release() { }
+
+ /// Inform the memory manager to acquire memory
+ virtual void Acquire() { }
+
+ static bool IsLayerSupported(Compute compute, const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
+ static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
std::string& outReasonIfUnsupported);
- static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported);
virtual bool SupportsSubTensors() const = 0;
@@ -103,6 +111,15 @@ public:
virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
const WorkloadInfo& info) const = 0;
+
+ virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const = 0;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const = 0;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const = 0;
};
} //namespace armnn
diff --git a/src/armnn/backends/WorkloadUtils.hpp b/src/armnn/backends/WorkloadUtils.hpp
new file mode 100644
index 0000000000..f21c78558e
--- /dev/null
+++ b/src/armnn/backends/WorkloadUtils.hpp
@@ -0,0 +1,139 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "armnn/Tensor.hpp"
+#include "ITensorHandle.hpp"
+
+#include <boost/cast.hpp>
+
+namespace armnn
+{
+namespace
+{
+/// Assigns one element of @p array to @p arg, reading the array back to front.
+///
+/// With @p idx == 0 the last of the first @p num elements is read, with @p idx == 1 the one before it,
+/// and so on. @p idx is taken by reference and incremented after a successful assignment.
+/// When @p idx has already reached @p num nothing is assigned and @p arg keeps its current value.
+template<typename ArrayType, typename Arg>
+void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
+{
+    if (idx >= num)
+    {
+        return;
+    }
+
+    arg = array[(num - 1) - idx];
+    idx++;
+}
+
+/// Variadic form: fills @p assignee and then each of @p args in turn from the back of @p array.
+///
+/// @p idx is taken by value here so callers can pass a literal start index; the single-argument
+/// overload advances the local copy, which is then handed on to the recursive call.
+/// Arguments for which no array element is left (index >= @p num) are left untouched.
+template<typename T, typename ArrayType, typename ...Args>
+void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args)
+{
+    AssignValues(num, idx, array, assignee);
+
+    AssignValues(num, idx, array, args...);
+}
+} // namespace
+
+/// Copies the region common to @p srcTensor and @p dstTensor, one row at a time.
+///
+/// Shapes and strides of both tensors are read back to front into width / height / channels / batches.
+/// Dimensions a tensor does not have keep their defaults (extent 1, stride 0). For every batch,
+/// channel and row inside the smaller of the two extents, @p copy is invoked as
+/// copy(dstRow, srcRow, copyLength), where copyLength is the smaller of width * widthStride of the
+/// two tensors. Both tensors are mapped for the duration of the copy and unmapped afterwards.
+///
+/// Row addresses are computed from the mapped base pointers and the strides, so no running pointer
+/// is rewound through a narrowing cast or advanced beyond the last row that is actually copied.
+///
+/// @param srcTensor Handle that is read from.
+/// @param dstTensor Handle that is written to.
+/// @param copy      Callable receiving (uint8_t* dst, const uint8_t* src, size_t length).
+template<typename CopyFunc>
+void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
+{
+    static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyTensorContents");
+
+    TensorShape srcStrides = srcTensor->GetStrides();
+    const TensorShape& srcShape = srcTensor->GetShape();
+    TensorShape dstStrides = dstTensor->GetStrides();
+    const TensorShape& dstShape = dstTensor->GetShape();
+
+    size_t srcBatches = 1;
+    size_t srcChannels = 1;
+    size_t srcHeight = 1;
+    size_t srcWidth = 1;
+    AssignValues(srcShape.GetNumDimensions(), 0, srcShape,
+                 srcWidth,
+                 srcHeight,
+                 srcChannels,
+                 srcBatches);
+
+    size_t srcBatchStride = 0;
+    size_t srcChannelStride = 0;
+    size_t srcHeightStride = 0;
+    size_t srcWidthStride = 0;
+    AssignValues(srcStrides.GetNumDimensions(), 0, srcStrides,
+                 srcWidthStride,
+                 srcHeightStride,
+                 srcChannelStride,
+                 srcBatchStride);
+
+    size_t dstBatches = 1;
+    size_t dstChannels = 1;
+    size_t dstHeight = 1;
+    size_t dstWidth = 1;
+    AssignValues(dstShape.GetNumDimensions(), 0, dstShape,
+                 dstWidth,
+                 dstHeight,
+                 dstChannels,
+                 dstBatches);
+
+    size_t dstBatchStride = 0;
+    size_t dstChannelStride = 0;
+    size_t dstHeightStride = 0;
+    size_t dstWidthStride = 0;
+    AssignValues(dstStrides.GetNumDimensions(), 0, dstStrides,
+                 dstWidthStride,
+                 dstHeightStride,
+                 dstChannelStride,
+                 dstBatchStride);
+
+    const uint8_t* const srcData = static_cast<const uint8_t*>(srcTensor->Map());
+    uint8_t* const dstData = static_cast<uint8_t*>(dstTensor->Map());
+
+    // Only the region present in both tensors is copied.
+    const size_t copyLength = std::min(srcWidth*srcWidthStride, dstWidth*dstWidthStride);
+    const size_t copyHeight = std::min(srcHeight, dstHeight);
+    const size_t copyChannels = std::min(srcChannels, dstChannels);
+    const size_t copyBatches = std::min(srcBatches, dstBatches);
+
+    for (size_t b = 0; b < copyBatches; ++b)
+    {
+        const uint8_t* const srcBatch = srcData + b * srcBatchStride;
+        uint8_t* const dstBatch = dstData + b * dstBatchStride;
+        for (size_t c = 0; c < copyChannels; ++c)
+        {
+            const uint8_t* const srcChannel = srcBatch + c * srcChannelStride;
+            uint8_t* const dstChannel = dstBatch + c * dstChannelStride;
+            for (size_t h = 0; h < copyHeight; ++h)
+            {
+                // Named, non-const locals so the callable sees the same argument kinds as before.
+                const uint8_t* srcRow = srcChannel + h * srcHeightStride;
+                uint8_t* dstRow = dstChannel + h * dstHeightStride;
+                copy(dstRow, srcRow, copyLength);
+            }
+        }
+    }
+
+    srcTensor->Unmap();
+    dstTensor->Unmap();
+}
+
+/// Builds (source, destination) handle pairs from a descriptor's inputs and outputs.
+///
+/// For every index i below descriptor.m_Inputs.size(), input i is downcast to SrcTensorHandleType*
+/// and output i to DstTensorHandleType*, and the pair is appended to @p tensorHandlePairs.
+/// Existing entries of @p tensorHandlePairs are kept.
+template <typename SrcTensorHandleType, typename DstTensorHandleType, typename DescriptorType>
+void GatherTensorHandlePairs(const DescriptorType& descriptor,
+                             std::vector<std::pair<SrcTensorHandleType*, DstTensorHandleType*>>& tensorHandlePairs)
+{
+    const auto inputCount = static_cast<unsigned int>(descriptor.m_Inputs.size());
+    tensorHandlePairs.reserve(inputCount);
+
+    unsigned int index = 0;
+    while (index < inputCount)
+    {
+        auto* const source = boost::polymorphic_downcast<SrcTensorHandleType*>(descriptor.m_Inputs[index]);
+        auto* const destination = boost::polymorphic_downcast<DstTensorHandleType*>(descriptor.m_Outputs[index]);
+
+        tensorHandlePairs.emplace_back(source, destination);
+        ++index;
+    }
+}
+
+} //namespace armnn \ No newline at end of file
diff --git a/src/armnn/backends/test/ActivationFixture.hpp b/src/armnn/backends/test/ActivationFixture.hpp
index a67a110354..69f3c8be05 100644
--- a/src/armnn/backends/test/ActivationFixture.hpp
+++ b/src/armnn/backends/test/ActivationFixture.hpp
@@ -41,7 +41,7 @@ struct ActivationFixture
armnn::TensorInfo inputTensorInfo;
armnn::TensorInfo outputTensorInfo;
- // parameters used by some of the activation functions
+ // Parameters used by some of the activation functions.
float a = 0.234f;
float b = -12.345f;
};
diff --git a/src/armnn/backends/test/ActivationTestImpl.hpp b/src/armnn/backends/test/ActivationTestImpl.hpp
index 255a00ef0b..e699b2289b 100644
--- a/src/armnn/backends/test/ActivationTestImpl.hpp
+++ b/src/armnn/backends/test/ActivationTestImpl.hpp
@@ -53,7 +53,7 @@ LayerTestResult<T, 4> BoundedReLuTestCommon(armnn::IWorkloadFactory& workloadFac
std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
- // Setup bounded ReLu
+ // Setup bounded ReLu.
armnn::ActivationQueueDescriptor descriptor;
armnn::WorkloadInfo workloadInfo;
AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
@@ -94,7 +94,7 @@ LayerTestResult<float, 4> BoundedReLuUpperAndLowerBoundTest(armnn::IWorkloadFact
0.999f, 1.2f, 0.89f, 6.1f,
};
- // Calculated manually
+ // Calculated manually.
std::vector<float> output = std::vector<float>{
-1.0f, 0.1f, 0.5f, 1.0f,
0.786f, 0.9875f, -1.0f, 0.384f,
@@ -122,7 +122,7 @@ LayerTestResult<float, 4> BoundedReLuUpperBoundOnlyTest(armnn::IWorkloadFactory&
0.999f, 1.2f, 0.89f, 6.1f,
};
- // Calculated manually
+ // Calculated manually.
std::vector<float> output = std::vector<float>{
0.0f, 0.1f, 0.5f, 6.0f,
0.786f, 5.9875f, 0.0f, 0.384f,
@@ -147,7 +147,7 @@ LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperBoundOnlyTest(armnn::IWorkloadF
251, 8, 92
};
- // Calculated manually
+ // Calculated manually.
std::vector<uint8_t> output = std::vector<uint8_t>{
0, 122, 0,
255, 0, 58
@@ -176,7 +176,7 @@ LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperAndLowerBoundTest(armnn::IWorkl
251, 8, 92
};
- // Calculated manually
+ // Calculated manually.
std::vector<uint8_t> output = std::vector<uint8_t>{
51, 192, 32,
192, 32, 92
@@ -186,7 +186,7 @@ LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperAndLowerBoundTest(armnn::IWorkl
float inputScale = 0.0125f;
return BoundedReLuTestCommon(workloadFactory, 1.0f, -1.0f,
- inputScale, inputOffset, inputScale, inputOffset, // input/output scale & offset same
+ inputScale, inputOffset, inputScale, inputOffset, // Input/output scale & offset same.
input, output,
inputWidth, inputHeight, inputChannels, inputBatchSize);
}
@@ -229,13 +229,14 @@ boost::multi_array<float, 4> BoundedReLuRandomInputTest(armnn::IWorkloadFactory&
boost::multi_array<float, 4> output(GetTensorShapeAsArray<4>(outputTensorInfo));
- // min/max random values passed to MakeRandomTensor are purposely outside of the ReLu range [lowerBound, upperBound]
+ // Min/max random values passed to MakeRandomTensor are purposely outside of the ReLu
+ // range [lowerBound, upperBound].
auto input = MakeRandomTensor<float, 4>(inputTensorInfo, 4605828, lowerBound - 5.0f, upperBound * 2.0f);
std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
- // Setup bounded ReLu
+ // Set up bounded ReLu.
armnn::ActivationQueueDescriptor descriptor;
armnn::WorkloadInfo workloadInfo;
AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
@@ -308,7 +309,7 @@ LayerTestResult<T,4> ConstantLinearActivationTestCommon(armnn::IWorkloadFactory&
std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
- // Do linear activation that should leave tensor unchanged
+ // Do linear activation that should leave the tensor unchanged.
armnn::ActivationQueueDescriptor data;
armnn::WorkloadInfo info;
AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
@@ -329,7 +330,7 @@ LayerTestResult<T,4> ConstantLinearActivationTestCommon(armnn::IWorkloadFactory&
CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
- // Ensure output equals input
+ // Ensure output equals input.
ret.outputExpected = input;
return ret;
@@ -386,7 +387,7 @@ LayerTestResult<T, 4> SimpleActivationTest(armnn::IWorkloadFactory& workloadFact
std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
- // Setup bounded ReLu
+ // Set up bounded ReLu.
armnn::ActivationQueueDescriptor descriptor;
armnn::WorkloadInfo workloadInfo;
AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
@@ -407,7 +408,7 @@ LayerTestResult<T, 4> SimpleActivationTest(armnn::IWorkloadFactory& workloadFact
CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
- // Calculated manually
+ // Calculated manually.
result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
return result;
@@ -423,7 +424,7 @@ LayerTestResult<T, 4> SimpleSigmoidTestCommon(armnn::IWorkloadFactory& workloadF
1.0f, 2.0f, 3.0f, 4.0f
};
- // Calculate output values for input
+ // Calculate output values for input.
auto f = [](float value)
{
return 1.0f / (1.0f + std::exp(-value));
diff --git a/src/armnn/backends/test/ArmComputeCl.cpp b/src/armnn/backends/test/ArmComputeCl.cpp
index ae42d03ee3..d0cb7243c3 100644
--- a/src/armnn/backends/test/ArmComputeCl.cpp
+++ b/src/armnn/backends/test/ArmComputeCl.cpp
@@ -3,7 +3,6 @@
// See LICENSE file in the project root for full license information.
//
#include <boost/test/unit_test.hpp>
-
#include "test/TensorHelpers.hpp"
#include "LayerTests.hpp"
@@ -13,6 +12,7 @@
#include "backends/RefWorkloadFactory.hpp"
#include "backends/ClLayerSupport.hpp"
#include "ActivationFixture.hpp"
+#include "ClContextControlFixture.hpp"
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLScheduler.h>
@@ -21,7 +21,7 @@
#include "test/UnitTests.hpp"
-BOOST_AUTO_TEST_SUITE(Compute_ArmComputeCl)
+BOOST_FIXTURE_TEST_SUITE(Compute_ArmComputeCl, ClContextControlFixture)
using FactoryType = armnn::ClWorkloadFactory;
// ============================================================================
@@ -65,27 +65,24 @@ ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1Uint8, DepthwiseConv
ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dAsymmetric, DepthwiseConvolution2dAsymmetricTest, true)
ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dAsymmetric, DepthwiseConvolution2dAsymmetricTest, false)
-// Splitter
-BOOST_AUTO_TEST_CASE(SimpleSplitter)
+// Softmax
+BOOST_AUTO_TEST_CASE(Softmax4dSupport)
{
- armnn::ClWorkloadFactory workloadFactory;
- auto testResult = SplitterTest(workloadFactory);
- for (unsigned int i = 0; i < testResult.size(); ++i)
- {
- BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected));
- }
-}
+ const unsigned int numDimensions = 4u;
+ std::array<unsigned int, numDimensions> dimensionSizes;
+ dimensionSizes.fill(1u);
-BOOST_AUTO_TEST_CASE(SimpleSplitterUint8)
-{
- armnn::ClWorkloadFactory workloadFactory;
- auto testResult = SplitterUint8Test(workloadFactory);
- for (unsigned int i = 0; i < testResult.size(); ++i)
- {
- BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected));
- }
+ const armnn::TensorInfo inputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32);
+ const armnn::TensorInfo outputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32);
+
+ // 4D Softmax should be reported as unsupported on the CL backend
+ BOOST_TEST(!armnn::IsSoftmaxSupportedCl(inputInfo, outputInfo, armnn::SoftmaxDescriptor()));
}
+// Splitter
+ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest)
+ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test)
+
ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest)
ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test)
@@ -209,6 +206,19 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test)
ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test)
ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test)
+// Lstm
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32WithCifgWithPeepholeNoProjection,
+ LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgNoPeepholeNoProjection,
+ LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest)
+ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgWithPeepholeWithProjection,
+ LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest)
+
+// Convert from Float16 to Float32
+ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
+// Convert from Float32 to Float16
+ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test)
+
// ============================================================================
// COMPARE tests
diff --git a/src/armnn/backends/test/ArmComputeNeon.cpp b/src/armnn/backends/test/ArmComputeNeon.cpp
index 0a78b75e2e..12947ca77a 100644
--- a/src/armnn/backends/test/ArmComputeNeon.cpp
+++ b/src/armnn/backends/test/ArmComputeNeon.cpp
@@ -54,7 +54,7 @@ armnn::Convolution2dDescriptor MakeConv2dDesc(uint32_t strideX, uint32_t strideY
BOOST_AUTO_TEST_CASE(Conv2dUtils)
{
- // the only preferred Neon convolution is 1x1 with padding=0 and stride size {1,2,3}
+ // The only preferred Neon convolution is 1x1 with padding=0 and stride size {1,2,3}.
armnn::TensorShape shape1x1({ 1,1,1,1 });
armnn::TensorInfo info1x1(shape1x1, armnn::DataType::Float32);
BOOST_TEST(armnn::IsNeonDirectConvolutionPreferred(info1x1, MakeConv2dDesc(1, 1)));
@@ -98,49 +98,133 @@ armnn::DepthwiseConvolution2dDescriptor MakeDepthwiseConv2dDesc(uint32_t strideX
uint32_t depthMultiplier = 1, uint32_t padLeft = 0, uint32_t padRight = 0,
uint32_t padTop = 0, uint32_t padBottom = 0)
{
+ boost::ignore_unused(depthMultiplier);
+
armnn::DepthwiseConvolution2dDescriptor desc;
+
desc.m_PadLeft = padLeft;
desc.m_PadRight = padRight;
+
desc.m_PadTop = padTop;
desc.m_PadBottom = padBottom;
desc.m_StrideX = strideX;
desc.m_StrideY = strideY;
- desc.m_BiasEnabled = true;
+ desc.m_BiasEnabled = false;
+
return desc;
}
+armnn::TensorInfo CreateOutputTensorInfo(const armnn::TensorInfo& inputInfo,
+ const armnn::TensorInfo& weightsInfo,
+ const armnn::DepthwiseConvolution2dDescriptor& descriptor,
+ armnn::DataType dataType)
+{
+ const armnn::TensorShape& inputShape = inputInfo.GetShape();
+ const armnn::TensorShape& filterShape = weightsInfo.GetShape();
+
+ unsigned int inWidth = inputShape[3];
+ unsigned int inHeight = inputShape[2];
+ unsigned int inBatchSize = inputShape[0];
+
+ unsigned int filterWidth = filterShape[3];
+ unsigned int readWidth = (inWidth + descriptor.m_PadLeft + descriptor.m_PadRight) - (filterWidth);
+ unsigned int outWidth = 1u + (readWidth / descriptor.m_StrideX);
+
+ unsigned int filterHeight = filterShape[2];
+ unsigned int readHeight = (inHeight + descriptor.m_PadTop + descriptor.m_PadBottom) - (filterHeight);
+ unsigned int outHeight = 1u + (readHeight / descriptor.m_StrideY);
+ unsigned int depthMultiplier = filterShape[0];
+
+ unsigned int outChannels = filterShape[1] * depthMultiplier;
+ unsigned int outBatchSize = inBatchSize;
+
+ armnn::TensorShape outputShape({outBatchSize, outChannels, outHeight, outWidth});
+ return armnn::TensorInfo(outputShape, dataType);
+}
}
BOOST_AUTO_TEST_CASE(DepthwiseConv2dUtils)
{
- armnn::TensorInfo inputInfo({ 1, 1, 10, 10 }, armnn::DataType::Float32);
- armnn::TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, armnn::DataType::Float32);
+ const armnn::DataType dataType = armnn::DataType::Float32;
+
+ armnn::TensorInfo inputInfo({1, 1, 10, 10 }, dataType);
+ armnn::TensorInfo outputInfo;
+ armnn::TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, dataType);
+ armnn::TensorInfo biasesInfo;
+
+ armnn::DepthwiseConvolution2dDescriptor descriptor;
// Strides supported: 1,2,3
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 2), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 3), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 1), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 2), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 3), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 1), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 2), weightsInfo3x3));
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 3), weightsInfo3x3));
-
- // Unsupported stride
- BOOST_TEST(!armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(4, 1), weightsInfo3x3));
+ descriptor = MakeDepthwiseConv2dDesc(1, 1);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(1, 2);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(1, 3);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(2, 1);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(2, 2);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(2, 3);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(3, 1);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(3, 2);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ descriptor = MakeDepthwiseConv2dDesc(3, 3);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
+
+ // Supported stride 4
+ descriptor = MakeDepthwiseConv2dDesc(4, 1);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
// Supported weights shape 1x1
armnn::TensorInfo weightsInfo1x1({ 1, 1, 1, 1 }, armnn::DataType::Float32);
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo1x1));
+ descriptor = MakeDepthwiseConv2dDesc(1, 1);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo1x1, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo1x1, biasesInfo));
// Supported shape 2x2
armnn::TensorInfo weightsInfo2x2({ 1, 1, 2, 2 }, armnn::DataType::Float32);
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo2x2));
+ descriptor = MakeDepthwiseConv2dDesc(1, 1);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo2x2, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo2x2, biasesInfo));
// Asymmetric padding
- BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1, 1, 1, 2, 1, 2),
- weightsInfo3x3));
+ descriptor = MakeDepthwiseConv2dDesc(1, 1, 1, 1, 2, 1, 2);
+ outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType);
+ BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor,
+ weightsInfo3x3, biasesInfo));
}
// Pooling
@@ -201,27 +285,24 @@ ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f)
ARMNN_AUTO_TEST_CASE(ReLu1Uint8, BoundedReLuUint8UpperAndLowerBoundTest)
ARMNN_AUTO_TEST_CASE(ReLu6Uint8, BoundedReLuUint8UpperBoundOnlyTest)
-// Splitter
-BOOST_AUTO_TEST_CASE(SimpleSplitter)
+// Softmax
+BOOST_AUTO_TEST_CASE(Softmax4dSupport)
{
- armnn::NeonWorkloadFactory workloadFactory;
- auto testResult = SplitterTest(workloadFactory);
- for (unsigned int i = 0; i < testResult.size(); ++i)
- {
- BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected));
- }
-}
+ const unsigned int numDimensions = 4u;
+ std::array<unsigned int, numDimensions> dimensionSizes;
+ dimensionSizes.fill(1u);
-BOOST_AUTO_TEST_CASE(SimpleSplitterUint8)
-{
- armnn::NeonWorkloadFactory workloadFactory;
- auto testResult = SplitterUint8Test(workloadFactory);
- for (unsigned int i = 0; i < testResult.size(); ++i)
- {
- BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected));
- }
+ const armnn::TensorInfo inputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32);
+ const armnn::TensorInfo outputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32);
+
+ // 4D Softmax should be reported as unsupported on the NEON backend
+ BOOST_TEST(!armnn::IsSoftmaxSupportedNeon(inputInfo, outputInfo, armnn::SoftmaxDescriptor()));
}
+// Splitter
+ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest)
+ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test)
+
ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest)
ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test)
@@ -375,5 +456,4 @@ ARMNN_COMPARE_REF_FIXTURE_TEST_CASE(CompareSqrtActivationWithReference, Positive
ARMNN_COMPARE_REF_FIXTURE_TEST_CASE(CompareSquareActivationWithReference, ActivationFixture,
CompareActivationTest, armnn::ActivationFunction::Square, 5u)
-
BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/backends/test/BatchNormTestImpl.hpp b/src/armnn/backends/test/BatchNormTestImpl.hpp
index 861ef6b053..82e6e86747 100644
--- a/src/armnn/backends/test/BatchNormTestImpl.hpp
+++ b/src/armnn/backends/test/BatchNormTestImpl.hpp
@@ -52,7 +52,7 @@ LayerTestResult<T,4> BatchNormTestImpl(armnn::IWorkloadFactory& workloadFactory,
4.f, 1.f,
-2.f, 4.f
}));
- // these values are per-channel of the input
+ // These values are per-channel of the input.
auto mean = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, -2}));
auto variance = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {4, 9}));
auto beta = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, 2}));
@@ -82,8 +82,8 @@ LayerTestResult<T,4> BatchNormTestImpl(armnn::IWorkloadFactory& workloadFactory,
data.m_Gamma = &gammaTensor;
data.m_Parameters.m_Eps = 0.0f;
- // for each channel:
- // substract mean, divide by standard deviation (with an epsilon to avoid div by 0)
+ // For each channel:
+ // subtract mean, divide by standard deviation (with an epsilon to avoid div by 0),
// multiply by gamma and add beta
ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
QuantizedVector<T>(qScale, qOffset,
diff --git a/src/armnn/backends/test/ClContextControlFixture.hpp b/src/armnn/backends/test/ClContextControlFixture.hpp
new file mode 100644
index 0000000000..13c061f818
--- /dev/null
+++ b/src/armnn/backends/test/ClContextControlFixture.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClContextControl.hpp"
+
+template<bool ProfilingEnabled>
+struct ClContextControlFixtureBase
+{
+ // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case
+ ClContextControlFixtureBase() : m_ClContextControl(nullptr, ProfilingEnabled) {}
+ ~ClContextControlFixtureBase() {}
+
+ armnn::ClContextControl m_ClContextControl;
+};
+
+using ClContextControlFixture = ClContextControlFixtureBase<false>;
+using ClProfilingContextControlFixture = ClContextControlFixtureBase<true>;
diff --git a/src/armnn/backends/test/Conv2dTestImpl.hpp b/src/armnn/backends/test/Conv2dTestImpl.hpp
index 0c34beaa33..43297880f8 100644
--- a/src/armnn/backends/test/Conv2dTestImpl.hpp
+++ b/src/armnn/backends/test/Conv2dTestImpl.hpp
@@ -32,7 +32,7 @@ struct FullyConnectedBiasTypeForInputType<uint8_t>
using Type = int32_t;
};
-// Modifies a std::vector in-place using a specified bias
+// Modifies a std::vector in-place using a specified bias.
template<typename T, typename B>
void ApplyBias(std::vector<T>& v, float vScale, int32_t vOffset,
const std::vector<B>& bias, float bScale, int32_t bOffset, uint32_t w, uint32_t h)
@@ -42,7 +42,7 @@ void ApplyBias(std::vector<T>& v, float vScale, int32_t vOffset,
BOOST_ASSERT_MSG((armnn::IsQuantizedType<B>() && bScale != 0.0f) || (!armnn::IsQuantizedType<B>()),
"Invalid type and parameter combination.");
- // Note we need to dequantize and re-quantize the image value and the bias
+ // Note we need to dequantize and re-quantize the image value and the bias.
for (uint32_t i = 0; i < bias.size(); ++i)
{
float dBias = SelectiveDequantize(bias[i], bScale, bOffset);
@@ -90,15 +90,15 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl
bool biasEnabled = bias.size() > 0;
- // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches)
+ // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches).
BOOST_ASSERT(inputNum == 1);
BOOST_ASSERT(outputNum == 1);
- // If a bias is used, its size must equal the number of output channels
+ // If a bias is used, its size must equal the number of output channels.
BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels);
- // Note these tensors will use two (identical) batches
+ // Note these tensors will use two (identical) batches.
armnn::TensorInfo inputTensorInfo({2*inputNum, inputChannels, inputHeight, inputWidth}, armnn::GetDataType<T>());
armnn::TensorInfo outputTensorInfo({2*outputNum, outputChannels, outputHeight, outputWidth},
armnn::GetDataType<T>());
@@ -120,7 +120,7 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl
LayerTestResult<T, 4> ret(outputTensorInfo);
- // Construct input data - Two batches of the same input image
+ // Construct input data - two batches of the same input image.
std::vector<T> inputImage;
inputImage.assign(input.data(), input.data() + 1*inputChannels*inputHeight*inputWidth);
std::vector<T> inputData;
@@ -131,7 +131,7 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl
std::vector<T> outputImage;
outputImage.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth);
- // Apply bias to output image if enabled
+ // Apply bias to output image if it is enabled.
if(biasEnabled)
{
std::vector<T> biasV;
@@ -141,14 +141,14 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl
outputWidth, outputHeight);
}
- // Construct expected output data - two identical images
+ // Construct expected output data - two identical images.
std::vector<T> outputData;
outputData.insert(outputData.end(), outputImage.begin(), outputImage.end());
outputData.insert(outputData.end(), outputImage.begin(), outputImage.end());
ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
- // todo: nontrivial padding and strides
+ // Todo: nontrivial padding and strides.
uint32_t strideX = 1;
uint32_t strideY = 1;
@@ -171,7 +171,7 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl
AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
data.m_Weight = &weightsTensor;
- data.m_Bias = &biasTensor; // still set this whether or not bias is enabled - can be a source of bugs
+ data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - can be a source of bugs.
data.m_Parameters.m_StrideX = strideX;
data.m_Parameters.m_StrideY = strideY;
data.m_Parameters.m_PadLeft = padLeft;
@@ -222,11 +222,11 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF
unsigned int outputHeight = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
unsigned int outputWidth = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
- // If a bias is used, its size must equal the number of output channels
+ // If a bias is used, its size must equal the number of output channels.
bool biasEnabled = bias.size() > 0;
BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels);
- // create the tensors
+ // Create the tensors.
armnn::TensorInfo inputTensorInfo({inputNum, inputChannels, inputHeight, inputWidth}, armnn::GetDataType<T>());
armnn::TensorInfo outputTensorInfo({outputNum, outputChannels, outputHeight, outputWidth},
armnn::GetDataType<T>());
@@ -246,12 +246,12 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF
biasDesc.SetQuantizationOffset(0);
}
- // Construct the input data
+ // Construct the input data.
std::vector<T> inputData;
inputData.assign(input.data(), input.data() + inputChannels*inputHeight*inputWidth);
auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData);
- // Construct the output data, with bias applied, as appropriate
+ // Construct the output data, with bias applied, as appropriate.
std::vector<T> outputData;
outputData.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth);
if (biasEnabled)
@@ -280,7 +280,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF
armnn::DepthwiseConvolution2dQueueDescriptor data;
data.m_Weight = &weightsTensor;
- data.m_Bias = &biasTensor; // still set this whether or not bias is enabled - can be a source of bugs
+ data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - it can be a source of bugs.
data.m_Parameters.m_StrideX = strideX;
data.m_Parameters.m_StrideY = strideY;
data.m_Parameters.m_PadLeft = padLeft;
@@ -372,14 +372,14 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(armnn::IWorkloadFa
-1.f, 0.f, -1.f,
})));
- // manually calculated
+ // Manually calculated.
std::vector<T> outputImage(
QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
outputTensorInfo.GetQuantizationOffset(),
{0.f, 0.f})
);
- // Optionally apply bias to output image
+ // Optionally apply bias to output image.
if(biasEnabled)
{
ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
@@ -405,7 +405,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(armnn::IWorkloadFa
AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
data.m_Weight = &weightsTensor;
- data.m_Bias = &biasTensor; // still set this whether or not bias is enabled
+ data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled.
data.m_Parameters.m_StrideX = 1;
data.m_Parameters.m_StrideY = 1;
data.m_Parameters.m_PadLeft = 0;
@@ -520,7 +520,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo
0, 0, 0
})));
- // manually calculated
+ // Manually calculated.
std::vector<T> outputImage = std::vector<T>(
QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), {
3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f,
@@ -552,7 +552,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
}));
- // Optionally apply bias to output image
+ // Optionally apply bias to output image.
if(biasEnabled)
{
ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
@@ -578,7 +578,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo
AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
data.m_Weight = &weightsTensor;
- data.m_Bias = &biasTensor; // still set this whether or not bias is enabled
+ data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled.
data.m_Parameters.m_StrideX = 2;
data.m_Parameters.m_StrideY = 1;
data.m_Parameters.m_PadLeft = 0;
@@ -609,7 +609,7 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact
{
using B = typename FullyConnectedBiasTypeForInputType<T>::Type;
- // until we have a specialist 1D convolution layer, we can fake one using
+ // Until we have a specialist 1D convolution layer, we can fake one using
// 2D convolution with the final dimension set to 1.
// I don't anticipate this being particularly slow, given that convolution is implemented
// as a matrix multiplication, at which point dimension doesn't matter.
@@ -617,11 +617,11 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact
unsigned int batchSize = 1;
unsigned int inputChannels = 2;
unsigned int outputChannels = 3;
- unsigned int inputSize = 5; // the 1D size (could view as 'width' or 'height')
+ unsigned int inputSize = 5; // The 1D size (could view as 'width' or 'height').
unsigned int kernelSize = 3;
unsigned int padSize = 2;
unsigned int stride = 1;
- unsigned int outputSize = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride
+ unsigned int outputSize = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride.
armnn::TensorInfo inputInfo({batchSize, inputChannels, inputSize, 1}, armnn::GetDataType<T>());
armnn::TensorInfo outputInfo({batchSize, outputChannels, outputSize, 1}, armnn::GetDataType<T>());
@@ -671,7 +671,7 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact
2.5f, -1.0f + 3.0f, 1.25f - 3.2f + 2.5f, -1.0f - 5.0f, 1.25f + 0.5f - 2.0f, -3.0f, 0.5f
}));
- // Optionally apply bias to output image
+ // Optionally apply bias to output image.
if(biasEnabled)
{
ApplyBias(outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
@@ -712,7 +712,7 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact
workloadFactory.Finalize();
workload->Execute();
- // output
+ // Output
LayerTestResult<T,4> ret(outputInfo);
CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
ret.outputExpected = MakeTensor<T, 4>(outputInfo, outputData);
diff --git a/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp b/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp
new file mode 100644
index 0000000000..89faaf9fe6
--- /dev/null
+++ b/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp
@@ -0,0 +1,55 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/TypesUtils.hpp>
+
+#include <backends/WorkloadInfo.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <Half.hpp>
+
+LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+ using namespace half_float::literal;
+
+ const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
+ const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+
+ auto input = MakeTensor<armnn::Half, 4>(inputTensorInfo,
+ { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
+ 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h });
+
+ LayerTestResult<float, 4> ret(outputTensorInfo);
+ ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo,
+ { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+ 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f });
+
+ std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ armnn::ConvertFp16ToFp32QueueDescriptor data;
+ armnn::WorkloadInfo info;
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+ std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp16ToFp32(data, info);
+
+ inputHandle->Allocate();
+ outputHandle->Allocate();
+
+ CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+ workload->Execute();
+
+ CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+ return ret;
+}
diff --git a/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp b/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp
new file mode 100644
index 0000000000..1d9bee577c
--- /dev/null
+++ b/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp
@@ -0,0 +1,55 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/TypesUtils.hpp>
+
+#include <backends/WorkloadInfo.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <Half.hpp>
+
+LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory)
+{ // Runs the factory's ConvertFp32ToFp16 workload on 18 fixed values; returns the actual and expected Half tensors.
+ using namespace half_float::literal; // Needed for the _h literals used in the expected values.
+
+ const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+ const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
+
+ auto input = MakeTensor<float, 4>(inputTensorInfo,
+ { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+ 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f });
+
+ LayerTestResult<armnn::Half, 4> ret(outputTensorInfo);
+ ret.outputExpected = MakeTensor<armnn::Half, 4>(outputTensorInfo, // Same 18 values as the input, written as half literals.
+ { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
+ 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h });
+
+ std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ armnn::ConvertFp32ToFp16QueueDescriptor data;
+ armnn::WorkloadInfo info;
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); // Registers the Float32 handle as the workload input.
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); // Registers the Float16 handle as the workload output.
+
+ std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp32ToFp16(data, info);
+
+ inputHandle->Allocate(); // Handles are allocated only after the workload has been created.
+ outputHandle->Allocate();
+
+ CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+ workload->Execute();
+
+ CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); // Reads the converted data back into ret.output.
+
+ return ret;
+}
diff --git a/src/armnn/backends/test/CreateWorkloadCl.cpp b/src/armnn/backends/test/CreateWorkloadCl.cpp
index f83bb12bbe..5d4265911f 100644
--- a/src/armnn/backends/test/CreateWorkloadCl.cpp
+++ b/src/armnn/backends/test/CreateWorkloadCl.cpp
@@ -8,6 +8,7 @@
#include "backends/ClWorkloadUtils.hpp"
#include "backends/ClWorkloads.hpp"
#include "backends/ClTensorHandle.hpp"
+#include "ClContextControlFixture.hpp"
#include "test/CreateWorkloadClNeon.hpp"
@@ -17,16 +18,17 @@ boost::test_tools::predicate_result CompareIClTensorHandleShape(IClTensorHandle*
return CompareTensorHandleShape<IClTensorHandle>(tensorHandle, expectedDimensions);
}
-BOOST_AUTO_TEST_SUITE(CreateWorkloadCl)
+BOOST_FIXTURE_TEST_SUITE(CreateWorkloadCl, ClContextControlFixture)
-BOOST_AUTO_TEST_CASE(CreateActivationWorkload)
+template <typename ActivationWorkloadType, armnn::DataType DataType>
+static void ClCreateActivationWorkloadTest()
{
Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateActivationWorkloadTest<ClActivationFloat32Workload>(factory, graph);
+ auto workload = CreateActivationWorkloadTest<ActivationWorkloadType, DataType>(factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest).
ActivationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -35,14 +37,24 @@ BOOST_AUTO_TEST_CASE(CreateActivationWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {1}));
}
-BOOST_AUTO_TEST_CASE(CreateAdditionWorkload)
+BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload)
+{
+ ClCreateActivationWorkloadTest<ClActivationFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateActivationFloat16Workload)
+{
+ ClCreateActivationWorkloadTest<ClActivationFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename AdditionWorkloadType, armnn::DataType DataType>
+static void ClCreateAdditionWorkloadTest()
{
Graph graph;
ClWorkloadFactory factory;
+ auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType, DataType>(factory, graph);
- auto workload = CreateAdditionWorkloadTest<ClAdditionFloat32Workload>(factory, graph);
-
- // check that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest).
AdditionQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle1 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto inputHandle2 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[1]);
@@ -52,14 +64,26 @@ BOOST_AUTO_TEST_CASE(CreateAdditionWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3}));
}
-BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload)
+BOOST_AUTO_TEST_CASE(CreateAdditionFloat32Workload)
{
- Graph graph;
+ ClCreateAdditionWorkloadTest<ClAdditionFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateAdditionFloat16Workload)
+{
+ ClCreateAdditionWorkloadTest<ClAdditionFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename BatchNormalizationWorkloadType, armnn::DataType DataType>
+static void ClCreateBatchNormalizationWorkloadTest()
+{
+ Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateBatchNormalizationWorkloadTest<ClBatchNormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateBatchNormalizationWorkloadTest<BatchNormalizationWorkloadType, DataType>
+ (factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest).
BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -68,14 +92,57 @@ BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3, 1, 1}));
}
-template <typename Convolution2dWorkloadType>
-static void Convolution2dWorkloadTest()
+BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat32Workload)
+{
+ ClCreateBatchNormalizationWorkloadTest<ClBatchNormalizationFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat16Workload)
+{
+ ClCreateBatchNormalizationWorkloadTest<ClBatchNormalizationFloat32Workload, armnn::DataType::Float16>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateConvertFp16ToFp32Workload)
+{ // Creates a ClConvertFp16ToFp32Workload through the shared helper and checks its CL input/output tensor handles.
+ Graph graph;
+ ClWorkloadFactory factory;
+ auto workload = CreateConvertFp16ToFp32WorkloadTest<ClConvertFp16ToFp32Workload>(factory, graph);
+
+ ConvertFp16ToFp32QueueDescriptor queueDescriptor = workload->GetData();
+ auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
+
+ BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {3, 2, 3})); // NOTE(review): shapes presumably follow CreateConvertFp16ToFp32WorkloadTest - confirm there.
+ BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 3}));
+ BOOST_TEST((inputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F16)); // Double parentheses: the comparison is passed to BOOST_TEST as a single bool.
+ BOOST_TEST((outputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F32)); // Output handle must carry the converted F32 type.
+}
+
+BOOST_AUTO_TEST_CASE(CreateConvertFp32ToFp16Workload)
+{ // Creates a ClConvertFp32ToFp16Workload through the shared helper and checks its CL input/output tensor handles.
+ Graph graph;
+ ClWorkloadFactory factory;
+ auto workload = CreateConvertFp32ToFp16WorkloadTest<ClConvertFp32ToFp16Workload>(factory, graph);
+
+ ConvertFp32ToFp16QueueDescriptor queueDescriptor = workload->GetData();
+ auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
+
+ BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {3, 2, 3})); // NOTE(review): shapes presumably follow CreateConvertFp32ToFp16WorkloadTest - confirm there.
+ BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 3}));
+ BOOST_TEST((inputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F32)); // Double parentheses: the comparison is passed to BOOST_TEST as a single bool.
+ BOOST_TEST((outputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F16)); // Output handle must carry the converted F16 type.
+}
+
+template <typename Convolution2dWorkloadType, typename armnn::DataType DataType>
+static void ClConvolution2dWorkloadTest()
{
- Graph graph;
- ClWorkloadFactory factory;
- auto workload = CreateConvolution2dWorkloadTest<Convolution2dWorkloadType>(factory, graph);
+ Graph graph;
+ ClWorkloadFactory factory;
+ auto workload = CreateConvolution2dWorkloadTest<Convolution2dWorkloadType, DataType>
+ (factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
Convolution2dQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -85,18 +152,24 @@ static void Convolution2dWorkloadTest()
BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat32Workload)
{
- Convolution2dWorkloadTest<ClConvolution2dFloat32Workload>();
+ ClConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float32>();
}
+BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat16Workload)
+{
+ ClConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float16>();
+}
-template <typename Convolution2dWorkloadType>
-static void DirectConvolution2dWorkloadTest()
+
+template <typename Convolution2dWorkloadType, typename armnn::DataType DataType>
+static void ClDirectConvolution2dWorkloadTest()
{
- Graph graph;
- ClWorkloadFactory factory;
- auto workload = CreateDirectConvolution2dWorkloadTest<Convolution2dWorkloadType>(factory, graph);
+ Graph graph;
+ ClWorkloadFactory factory;
+ auto workload = CreateDirectConvolution2dWorkloadTest<Convolution2dWorkloadType, DataType>(
+ factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateDirectConvolution2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateDirectConvolution2dWorkloadTest).
Convolution2dQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -106,22 +179,28 @@ static void DirectConvolution2dWorkloadTest()
BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dFloat32Workload)
{
- DirectConvolution2dWorkloadTest<ClConvolution2dFloat32Workload>();
+ ClDirectConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dFloat16Workload)
+{
+ ClDirectConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float16>();
}
BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dUint8Workload)
{
- DirectConvolution2dWorkloadTest<ClConvolution2dUint8Workload>();
+ ClDirectConvolution2dWorkloadTest<ClConvolution2dUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload)
+template <typename FullyConnectedWorkloadType, typename armnn::DataType DataType>
+static void ClCreateFullyConnectedWorkloadTest()
{
- Graph graph;
+ Graph graph;
ClWorkloadFactory factory;
- auto workload =
- CreateFullyConnectedWorkloadTest<ClFullyConnectedFloat32Workload>(factory, graph);
+ auto workload =
+ CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest).
FullyConnectedQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -129,15 +208,28 @@ BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 7}));
}
-BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload)
+
+BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32WorkloadTest)
{
- Graph graph;
+ ClCreateFullyConnectedWorkloadTest<ClFullyConnectedFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat16WorkloadTest)
+{
+ ClCreateFullyConnectedWorkloadTest<ClFullyConnectedFloat32Workload, armnn::DataType::Float16>();
+}
+
+
+template <typename MultiplicationWorkloadType, typename armnn::DataType DataType>
+static void ClCreateMultiplicationWorkloadTest()
+{
+ Graph graph;
ClWorkloadFactory factory;
auto workload =
- CreateMultiplicationWorkloadTest<ClMultiplicationFloat32Workload>(factory, graph);
+ CreateMultiplicationWorkloadTest<MultiplicationWorkloadType, DataType>(factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest).
MultiplicationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle1 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto inputHandle2 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[1]);
@@ -147,14 +239,26 @@ BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3}));
}
-BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload)
+BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat32WorkloadTest)
+{
+ ClCreateMultiplicationWorkloadTest<ClMultiplicationFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat16WorkloadTest)
+{
+ ClCreateMultiplicationWorkloadTest<ClMultiplicationFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename NormalizationWorkloadType, typename armnn::DataType DataType>
+static void ClNormalizationWorkloadTest()
{
- Graph graph;
+ Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateNormalizationWorkloadTest<ClNormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateNormalizationWorkloadTest<NormalizationWorkloadType, DataType>
+ (factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest).
NormalizationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -163,14 +267,25 @@ BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 5, 5, 1}));
}
-BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload)
+BOOST_AUTO_TEST_CASE(CreateNormalizationFloat32Workload)
{
- Graph graph;
+ ClNormalizationWorkloadTest<ClNormalizationFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateNormalizationFloat16Workload)
+{
+ ClNormalizationWorkloadTest<ClNormalizationFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename Pooling2dWorkloadType, typename armnn::DataType DataType>
+static void ClPooling2dWorkloadTest()
+{
+ Graph graph;
ClWorkloadFactory factory;
- auto workload = CreatePooling2dWorkloadTest<ClPooling2dFloat32Workload>(factory, graph);
+ auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType, DataType>(factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreatePooling2dWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreatePooling2dWorkloadTest).
Pooling2dQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -179,18 +294,28 @@ BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 2, 4}));
}
-template <typename ReshapeWorkloadType>
+BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload)
+{
+ ClPooling2dWorkloadTest<ClPooling2dFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreatePooling2dFloat16Workload)
+{
+ ClPooling2dWorkloadTest<ClPooling2dFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename ReshapeWorkloadType, typename armnn::DataType DataType>
static void ClCreateReshapeWorkloadTest()
{
- Graph graph;
+ Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType>(factory, graph);
+ auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest).
ReshapeQueueDescriptor queueDescriptor = workload->GetData();
- auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
- auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
+ auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {4, 1}));
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {4})); // Leading size 1 dimensions are collapsed by ACL.
@@ -198,38 +323,56 @@ static void ClCreateReshapeWorkloadTest()
BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload)
{
- ClCreateReshapeWorkloadTest<ClReshapeFloat32Workload>();
+ ClCreateReshapeWorkloadTest<ClReshapeFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateReshapeFloat16Workload)
+{
+ ClCreateReshapeWorkloadTest<ClReshapeFloat32Workload, armnn::DataType::Float16>();
}
BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload)
{
- ClCreateReshapeWorkloadTest<ClReshapeUint8Workload>();
+ ClCreateReshapeWorkloadTest<ClReshapeUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-BOOST_AUTO_TEST_CASE(CreateSoftmaxWorkload)
+template <typename SoftmaxWorkloadType, typename armnn::DataType DataType>
+static void ClSoftmaxWorkloadTest()
{
- Graph graph;
+ Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateSoftmaxWorkloadTest<ClSoftmaxFloat32Workload>(factory, graph);
+ auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType, DataType>(factory, graph);
- // check that inputs/outputs are as we expect them (see definition of ClSoftmaxFloat32Workload)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateSoftmaxWorkloadTest).
SoftmaxQueueDescriptor queueDescriptor = workload->GetData();
- auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
- auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
+ auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {4, 1}));
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {4, 1}));
}
-BOOST_AUTO_TEST_CASE(CreateSplitterWorkload)
+
+BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32WorkloadTest)
+{
+ ClSoftmaxWorkloadTest<ClSoftmaxFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat16WorkloadTest)
+{
+ ClSoftmaxWorkloadTest<ClSoftmaxFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename SplitterWorkloadType, typename armnn::DataType DataType>
+static void ClSplitterWorkloadTest()
{
Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateSplitterWorkloadTest<ClSplitterFloat32Workload>(factory, graph);
+ auto workload = CreateSplitterWorkloadTest<SplitterWorkloadType, DataType>(factory, graph);
- // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest)
+ // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest).
SplitterQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {5, 7, 7}));
@@ -242,14 +385,25 @@ BOOST_AUTO_TEST_CASE(CreateSplitterWorkload)
auto outputHandle0 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
// NOTE: At the moment the CL collapses the tensor to a 2 dim when dimension zero = 1
- // we are raising this difference between the NEON and CL libs as an issue with the compute library team
+ // we are raising this difference between the NEON and CL libs as an issue with the compute library team.
BOOST_TEST(CompareIClTensorHandleShape(outputHandle0, {7, 7}));
}
-BOOST_AUTO_TEST_CASE(CreateSplitterMerger)
+BOOST_AUTO_TEST_CASE(CreateSplitterFloat32Workload)
+{
+ ClSplitterWorkloadTest<ClSplitterFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateSplitterFloat16Workload)
{
- // Test that it is possible to decide which output of the splitter layer
- // should be lined to which input of the merger layer
+ ClSplitterWorkloadTest<ClSplitterFloat32Workload, armnn::DataType::Float16>();
+}
+
+template <typename SplitterWorkloadType, typename MergerWorkloadType, typename armnn::DataType DataType>
+static void ClSplitterMergerTest()
+{
+ // Tests that it is possible to decide which output of the splitter layer
+ // should be linked to which input of the merger layer.
// We test that is is possible to specify 0th output
// of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input
// of the merger.
@@ -258,12 +412,13 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger)
ClWorkloadFactory factory;
auto workloads =
- CreateSplitterMergerWorkloadTest<ClSplitterFloat32Workload, ClMergerFloat32Workload>(factory, graph);
+ CreateSplitterMergerWorkloadTest<SplitterWorkloadType, MergerWorkloadType, DataType>
+ (factory, graph);
auto wlSplitter = std::move(workloads.first);
auto wlMerger = std::move(workloads.second);
- //check that the index of inputs/outputs matches what we declared on InputDescriptor construction.
+ //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction.
armnn::ClSubTensorHandle* sOut0 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
armnn::ClSubTensorHandle* sOut1 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
armnn::ClSubTensorHandle* mIn0 = dynamic_cast<armnn::ClSubTensorHandle*>(wlMerger->GetData().m_Inputs[0]);
@@ -274,22 +429,33 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger)
BOOST_TEST(mIn0);
BOOST_TEST(mIn1);
- //fliped order of inputs/outputs
+ //Flipped order of inputs/outputs.
bool validDataPointers = (sOut0 == mIn1) && (sOut1 == mIn0);
BOOST_TEST(validDataPointers);
- //also make sure that the inputs are subtensors of one tensor and outputs are sub tensors of another tensor
+ //Also make sure that the inputs are subtensors of one tensor and the outputs are subtensors of another tensor.
bool validSubTensorParents = (mIn0->GetTensor().parent() == mIn1->GetTensor().parent())
&& (sOut0->GetTensor().parent() == sOut1->GetTensor().parent());
BOOST_TEST(validSubTensorParents);
}
+BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat32Workload)
+{
+ ClSplitterMergerTest<ClSplitterFloat32Workload, ClMergerFloat32Workload, armnn::DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat16Workload)
+{
+ ClSplitterMergerTest<ClSplitterFloat32Workload, ClMergerFloat32Workload, armnn::DataType::Float16>();
+}
+
+
BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs)
{
// Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer.
- // We create a splitter with two outputs. That each of those outputs is used by two different activation layers
+ // We create a splitter with two outputs. Each of those outputs is used by two different activation layers.
Graph graph;
ClWorkloadFactory factory;
@@ -300,9 +466,10 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs)
std::unique_ptr<ClActivationFloat32Workload> wlActiv1_1;
CreateSplitterMultipleInputsOneOutputWorkloadTest<ClSplitterFloat32Workload,
- ClActivationFloat32Workload>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1);
+ ClActivationFloat32Workload, armnn::DataType::Float32>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1,
+ wlActiv1_0, wlActiv1_1);
- //check that the index of inputs/outputs matches what we declared on InputDescriptor construction.
+ //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction.
armnn::ClSubTensorHandle* sOut0 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
armnn::ClSubTensorHandle* sOut1 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
armnn::ClSubTensorHandle* activ0_0Im = dynamic_cast<armnn::ClSubTensorHandle*>(wlActiv0_0->GetData().m_Inputs[0]);
@@ -327,17 +494,18 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs)
BOOST_AUTO_TEST_CASE(CreateMemCopyWorkloadsCl)
{
ClWorkloadFactory factory;
- CreateMemCopyWorkloads<CopyFromCpuToClWorkload,CopyFromClToCpuWorkload,IClTensorHandle>(factory);
+ CreateMemCopyWorkloads<IClTensorHandle>(factory);
}
BOOST_AUTO_TEST_CASE(CreateL2NormalizationWorkload)
{
- Graph graph;
+ Graph graph;
ClWorkloadFactory factory;
- auto workload = CreateL2NormalizationWorkloadTest<ClL2NormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateL2NormalizationWorkloadTest<ClL2NormalizationFloat32Workload, armnn::DataType::Float32>
+ (factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest).
L2NormalizationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
@@ -346,4 +514,24 @@ BOOST_AUTO_TEST_CASE(CreateL2NormalizationWorkload)
BOOST_TEST(CompareIClTensorHandleShape(outputHandle, { 5, 20, 50, 67 }));
}
+template <typename LstmWorkloadType>
+static void ClCreateLstmWorkloadTest()
+{ // Creates an LSTM workload of type LstmWorkloadType via the shared CreateLstmWorkloadTest helper and checks two CL handle shapes.
+ Graph graph;
+ ClWorkloadFactory factory;
+ auto workload = CreateLstmWorkloadTest<LstmWorkloadType>(factory, graph); // Unlike the sibling helpers, no DataType template argument is passed.
+
+ LstmQueueDescriptor queueDescriptor = workload->GetData();
+ auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[1]); // Second output (index 1), not the first. NOTE(review): confirm which LSTM output this is.
+ BOOST_TEST(CompareIClTensorHandleShape(inputHandle, { 2, 2 })); // NOTE(review): shapes presumably follow CreateLstmWorkloadTest - confirm there.
+ BOOST_TEST(CompareIClTensorHandleShape(outputHandle, { 2, 4 }));
+}
+
+BOOST_AUTO_TEST_CASE(CreateLSTMWorkloadFloat32Workload)
+{
+ ClCreateLstmWorkloadTest<ClLstmFloat32Workload>();
+}
+
+
BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/backends/test/CreateWorkloadNeon.cpp b/src/armnn/backends/test/CreateWorkloadNeon.cpp
index 4d91fbfd31..b2a444af74 100644
--- a/src/armnn/backends/test/CreateWorkloadNeon.cpp
+++ b/src/armnn/backends/test/CreateWorkloadNeon.cpp
@@ -50,168 +50,302 @@ bool TestNeonTensorHandleInfo(armnn::INeonTensorHandle* handle, const armnn::Ten
} // namespace
-BOOST_AUTO_TEST_CASE(CreateActivationWorkload)
+template <typename ActivationWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateActivationWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateActivationWorkloadTest<NeonActivationFloat32Workload>(factory, graph);
+ auto workload = CreateActivationWorkloadTest<ActivationWorkloadType, DataType>
+ (factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest).
ActivationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({1, 1}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 1}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({1, 1}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 1}, DataType)));
}
-BOOST_AUTO_TEST_CASE(CreateAdditionWorkload)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateActivationFloat16Workload)
+{
+ NeonCreateActivationWorkloadTest<NeonActivationFloat32Workload, DataType::Float16>();
+}
+#endif
+
+BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload)
+{
+ NeonCreateActivationWorkloadTest<NeonActivationFloat32Workload, DataType::Float32>();
+}
+
+template <typename AdditionWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateAdditionWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateAdditionWorkloadTest<NeonAdditionFloat32Workload>(factory, graph);
+ auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType, DataType>(factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest).
AdditionQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle1 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto inputHandle2 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[1]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType)));
}
-BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateAdditionFloat16Workload)
+{
+ NeonCreateAdditionWorkloadTest<NeonAdditionFloat32Workload, DataType::Float16>();
+}
+#endif
+
+BOOST_AUTO_TEST_CASE(CreateAdditionFloat32Workload)
+{
+ NeonCreateAdditionWorkloadTest<NeonAdditionFloat32Workload, DataType::Float32>();
+}
+
+template <typename BatchNormalizationWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateBatchNormalizationWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateBatchNormalizationWorkloadTest<NeonBatchNormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateBatchNormalizationWorkloadTest<BatchNormalizationWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest).
BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 1, 1}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3, 1, 1}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 1, 1}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3, 1, 1}, DataType)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat16Workload)
+{
+ NeonCreateBatchNormalizationWorkloadTest<NeonBatchNormalizationFloat32Workload, DataType::Float16>();
}
+#endif
-BOOST_AUTO_TEST_CASE(CreateConvolution2dWorkload)
+BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat32Workload)
+{
+ NeonCreateBatchNormalizationWorkloadTest<NeonBatchNormalizationFloat32Workload, DataType::Float32>();
+}
+
+template <typename Convolution2dWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateConvolution2dWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateConvolution2dWorkloadTest<NeonConvolution2dFloat32Workload>(factory, graph);
+ auto workload = CreateConvolution2dWorkloadTest<Convolution2dWorkloadType,
+ DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
Convolution2dQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 8, 16}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 2, 2, 10}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 8, 16}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 2, 2, 10}, DataType)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat16Workload)
+{
+ NeonCreateConvolution2dWorkloadTest<NeonConvolution2dFloat32Workload, DataType::Float16>();
}
+#endif
-BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload)
+BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat32Workload)
+{
+ NeonCreateConvolution2dWorkloadTest<NeonConvolution2dFloat32Workload, DataType::Float32>();
+}
+
+template <typename FullyConnectedWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateFullyConnectedWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateFullyConnectedWorkloadTest<NeonFullyConnectedFloat32Workload>(factory, graph);
+ auto workload = CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType,
+ DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest).
FullyConnectedQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 1, 4, 5}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 7}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 1, 4, 5}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 7}, DataType)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat16Workload)
+{
+ NeonCreateFullyConnectedWorkloadTest<NeonFullyConnectedFloat32Workload, DataType::Float16>();
+}
+#endif
+
+BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32Workload)
+{
+ NeonCreateFullyConnectedWorkloadTest<NeonFullyConnectedFloat32Workload, DataType::Float32>();
}
-BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload)
+template <typename MultiplicationWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateMultiplicationWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateMultiplicationWorkloadTest<NeonMultiplicationFloat32Workload>(factory, graph);
+ auto workload = CreateMultiplicationWorkloadTest<MultiplicationWorkloadType,
+ DataType>(factory, graph);
- // check that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest)
+ // Checks that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest).
MultiplicationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle1 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto inputHandle2 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[1]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType)));
}
-BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat16Workload)
+{
+ NeonCreateMultiplicationWorkloadTest<NeonMultiplicationFloat32Workload, DataType::Float16>();
+}
+#endif
+
+BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat32Workload)
+{
+ NeonCreateMultiplicationWorkloadTest<NeonMultiplicationFloat32Workload, DataType::Float32>();
+}
+
+template <typename NormalizationWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateNormalizationWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateNormalizationWorkloadTest<NeonNormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateNormalizationWorkloadTest<NormalizationWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest).
NormalizationQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 5, 5, 1}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 5, 5, 1}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 5, 5, 1}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 5, 5, 1}, DataType)));
}
-BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateNormalizationFloat16Workload)
+{
+ NeonCreateNormalizationWorkloadTest<NeonNormalizationFloat32Workload, DataType::Float16>();
+}
+#endif
+
+BOOST_AUTO_TEST_CASE(CreateNormalizationFloat32Workload)
+{
+ NeonCreateNormalizationWorkloadTest<NeonNormalizationFloat32Workload, DataType::Float32>();
+}
+
+template <typename Pooling2dWorkloadType, typename armnn::DataType DataType>
+static void NeonCreatePooling2dWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreatePooling2dWorkloadTest<NeonPooling2dFloat32Workload>(factory, graph);
+ auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType, DataType>
+ (factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest).
Pooling2dQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 2, 5, 5}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 2, 2, 4}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 2, 5, 5}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 2, 2, 4}, DataType)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreatePooling2dFloat16Workload)
+{
+ NeonCreatePooling2dWorkloadTest<NeonPooling2dFloat32Workload, DataType::Float16>();
}
+#endif
-template <typename ReshapeWorkloadType>
-static void NeonCreateReshapeWorkloadTest(DataType dataType)
+BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload)
+{
+ NeonCreatePooling2dWorkloadTest<NeonPooling2dFloat32Workload, DataType::Float32>();
+}
+
+BOOST_AUTO_TEST_CASE(CreatePooling2dUint8Workload)
+{
+ NeonCreatePooling2dWorkloadTest<NeonPooling2dUint8Workload, DataType::QuantisedAsymm8>();
+}
+
+template <typename ReshapeWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateReshapeWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType>(factory, graph);
+ auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest).
ReshapeQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, dataType)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 4}, dataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 4}, DataType)));
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateReshapeFloat16Workload)
+{
+ NeonCreateReshapeWorkloadTest<NeonReshapeFloat32Workload, DataType::Float16>();
+}
+#endif
+
BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload)
{
- NeonCreateReshapeWorkloadTest<NeonReshapeFloat32Workload>(DataType::Float32);
+ NeonCreateReshapeWorkloadTest<NeonReshapeFloat32Workload, DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload)
{
- NeonCreateReshapeWorkloadTest<NeonReshapeUint8Workload>(DataType::QuantisedAsymm8);
+ NeonCreateReshapeWorkloadTest<NeonReshapeUint8Workload, DataType::QuantisedAsymm8>();
}
-BOOST_AUTO_TEST_CASE(CreateSoftmaxWorkload)
+template <typename SoftmaxWorkloadType, typename armnn::DataType DataType>
+static void NeonCreateSoftmaxWorkloadTest()
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateSoftmaxWorkloadTest<NeonSoftmaxFloat32Workload>(factory, graph);
+ auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest).
SoftmaxQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType::Float32)));
- BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({4, 1}, DataType::Float32)));
+ BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType)));
+ BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({4, 1}, DataType)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat16Workload)
+{
+ NeonCreateSoftmaxWorkloadTest<NeonSoftmaxFloat32Workload, DataType::Float16>();
+}
+#endif
+
+BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32Workload)
+{
+ NeonCreateSoftmaxWorkloadTest<NeonSoftmaxFloat32Workload, DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateSplitterWorkload)
{
Graph graph;
NeonWorkloadFactory factory;
- auto workload = CreateSplitterWorkloadTest<NeonSplitterFloat32Workload>(factory, graph);
+ auto workload = CreateSplitterWorkloadTest<NeonSplitterFloat32Workload, DataType::Float32>(factory, graph);
- // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest)
+ // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest).
SplitterQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({5, 7, 7}, DataType::Float32)));
@@ -228,22 +362,23 @@ BOOST_AUTO_TEST_CASE(CreateSplitterWorkload)
BOOST_AUTO_TEST_CASE(CreateSplitterMerger)
{
- // Test that it is possible to decide which output of the splitter layer
- // should be lined to which input of the merger layer
- // We test that is is possible to specify 0th output
- // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input
+ // Tests that it is possible to decide which output of the splitter layer
+ // should be linked to which input of the merger layer.
+ // We test that it is possible to specify the 0th output
+ // of the splitter to be the 1st input to the merger, and the 1st output of the splitter to be the 0th input
// of the merger.
Graph graph;
NeonWorkloadFactory factory;
auto workloads =
- CreateSplitterMergerWorkloadTest<NeonSplitterFloat32Workload, NeonMergerFloat32Workload>(factory, graph);
+ CreateSplitterMergerWorkloadTest<NeonSplitterFloat32Workload, NeonMergerFloat32Workload,
+ DataType::Float32>(factory, graph);
auto wlSplitter = std::move(workloads.first);
auto wlMerger = std::move(workloads.second);
- //check that the index of inputs/outputs matches what we declared on InputDescriptor construction.
+ // Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction.
armnn::INeonTensorHandle* sOut0 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
armnn::INeonTensorHandle* sOut1 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
armnn::INeonTensorHandle* mIn0 = dynamic_cast<armnn::INeonTensorHandle*>(wlMerger->GetData().m_Inputs[0]);
@@ -261,8 +396,8 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger)
BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs)
{
- // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer.
- // We create a splitter with two outputs. That each of those outputs is used by two different activation layers
+ // Tests that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer.
+ // We create a splitter with two outputs, each of which is used by two different activation layers.
Graph graph;
NeonWorkloadFactory factory;
@@ -273,7 +408,8 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs)
std::unique_ptr<NeonActivationFloat32Workload> wlActiv1_1;
CreateSplitterMultipleInputsOneOutputWorkloadTest<NeonSplitterFloat32Workload,
- NeonActivationFloat32Workload>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1);
+ NeonActivationFloat32Workload, DataType::Float32>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1,
+ wlActiv1_0, wlActiv1_1);
armnn::INeonTensorHandle* sOut0 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
armnn::INeonTensorHandle* sOut1 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
@@ -299,7 +435,7 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs)
BOOST_AUTO_TEST_CASE(CreateMemCopyWorkloadsNeon)
{
NeonWorkloadFactory factory;
- CreateMemCopyWorkloads<CopyFromCpuToNeonWorkload,CopyFromNeonToCpuWorkload,INeonTensorHandle>(factory);
+ CreateMemCopyWorkloads<INeonTensorHandle>(factory);
}
BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/backends/test/CreateWorkloadRef.cpp b/src/armnn/backends/test/CreateWorkloadRef.cpp
index abc46e4361..109156468a 100644
--- a/src/armnn/backends/test/CreateWorkloadRef.cpp
+++ b/src/armnn/backends/test/CreateWorkloadRef.cpp
@@ -39,71 +39,95 @@ void CheckInputsOutput(std::unique_ptr<Workload> workload,
BOOST_AUTO_TEST_SUITE(CreateWorkloadRef)
-template <typename ActivationWorkloadType>
+template <typename ActivationWorkloadType, armnn::DataType DataType>
static void RefCreateActivationWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateActivationWorkloadTest<ActivationWorkloadType>(factory, graph);
+ auto workload = CreateActivationWorkloadTest<ActivationWorkloadType, DataType>(factory, graph);
- // check that outputs are as we expect them (see definition of CreateActivationWorkloadTest)
+ // Checks that outputs are as we expect them (see definition of CreateActivationWorkloadTest).
CheckInputOutput(std::move(workload),
- TensorInfo({ 1, 1 }, ActivationWorkloadType::ms_DataType),
- TensorInfo({ 1, 1 }, ActivationWorkloadType::ms_DataType));
+ TensorInfo({ 1, 1 }, DataType),
+ TensorInfo({ 1, 1 }, DataType));
}
BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload)
{
- RefCreateActivationWorkloadTest<RefActivationFloat32Workload>();
+ RefCreateActivationWorkloadTest<RefActivationFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateActivationUint8Workload)
{
- RefCreateActivationWorkloadTest<RefActivationUint8Workload>();
+ RefCreateActivationWorkloadTest<RefActivationUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-template <typename AdditionWorkloadType>
+template <typename AdditionWorkloadType, armnn::DataType DataType>
static void RefCreateAdditionWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType>(factory, graph);
+ auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType, DataType>(factory, graph);
- // check that outputs are as we expect them (see definition of CreateAdditionWorkloadTest)
+ // Checks that outputs are as we expect them (see definition of CreateAdditionWorkloadTest).
CheckInputsOutput(std::move(workload),
- TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType),
- TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType),
- TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType));
+ TensorInfo({ 2, 3 }, DataType),
+ TensorInfo({ 2, 3 }, DataType),
+ TensorInfo({ 2, 3 }, DataType));
}
BOOST_AUTO_TEST_CASE(CreateAdditionFloatWorkload)
{
- RefCreateAdditionWorkloadTest<RefAdditionFloat32Workload>();
+ RefCreateAdditionWorkloadTest<RefAdditionFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateAdditionUint8Workload)
{
- RefCreateAdditionWorkloadTest<RefAdditionUint8Workload>();
+ RefCreateAdditionWorkloadTest<RefAdditionUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload)
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateBatchNormalizationWorkloadTest<RefBatchNormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateBatchNormalizationWorkloadTest<RefBatchNormalizationFloat32Workload, armnn::DataType::Float32>
+ (factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest).
CheckInputOutput(
std::move(workload), TensorInfo({2, 3, 1, 1}, DataType::Float32), TensorInfo({2, 3, 1, 1}, DataType::Float32));
}
+BOOST_AUTO_TEST_CASE(CreateConvertFp16ToFp32Float32Workload)
+{
+ Graph graph;
+ RefWorkloadFactory factory;
+ auto workload = CreateConvertFp16ToFp32WorkloadTest<RefConvertFp16ToFp32Workload>(factory, graph);
+
+ // Checks that outputs and inputs are as we expect them
+ CheckInputOutput(
+ std::move(workload), TensorInfo({1, 3, 2, 3}, DataType::Float16), TensorInfo({1, 3, 2, 3}, DataType::Float32));
+}
+
+BOOST_AUTO_TEST_CASE(CreateConvertFp32ToFp16Float16Workload)
+{
+ Graph graph;
+ RefWorkloadFactory factory;
+ auto workload = CreateConvertFp32ToFp16WorkloadTest<RefConvertFp32ToFp16Workload>(factory, graph);
+
+ // Checks that outputs and inputs are as we expect them
+ CheckInputOutput(
+ std::move(workload), TensorInfo({1, 3, 2, 3}, DataType::Float32), TensorInfo({1, 3, 2, 3}, DataType::Float16));
+}
+
BOOST_AUTO_TEST_CASE(CreateConvolution2dWorkload)
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateConvolution2dWorkloadTest<RefConvolution2dFloat32Workload>(factory, graph);
+ auto workload = CreateConvolution2dWorkloadTest<RefConvolution2dFloat32Workload,
+ DataType::Float32>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
CheckInputOutput(std::move(workload),
TensorInfo({2, 3, 8, 16}, DataType::Float32),
TensorInfo({2, 2, 2, 10}, DataType::Float32));
@@ -116,170 +140,172 @@ BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolution2dWorkload)
auto workload =
CreateDepthwiseConvolution2dWorkloadTest<RefDepthwiseConvolution2dFloat32Workload>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateDepthwiseConvolution2dWorkloadTest).
CheckInputOutput(std::move(workload),
TensorInfo({2, 3, 8, 16}, DataType::Float32),
TensorInfo({2, 9, 2, 10}, DataType::Float32));
}
-template <typename FullyConnectedWorkloadType>
+template <typename FullyConnectedWorkloadType, armnn::DataType DataType>
static void RefCreateFullyConnectedWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType>(factory, graph);
+ auto workload = CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest)
- float inputsQScale = FullyConnectedWorkloadType::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0;
- float outputQScale = FullyConnectedWorkloadType::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0;
+ // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest).
+ float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0;
+ float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0;
CheckInputOutput(std::move(workload),
- TensorInfo({ 3, 1, 4, 5 }, FullyConnectedWorkloadType::ms_DataType, inputsQScale),
- TensorInfo({ 3, 7 }, FullyConnectedWorkloadType::ms_DataType, outputQScale));
+ TensorInfo({ 3, 1, 4, 5 }, DataType, inputsQScale),
+ TensorInfo({ 3, 7 }, DataType, outputQScale));
}
BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32Workload)
{
- RefCreateFullyConnectedWorkloadTest<RefFullyConnectedFloat32Workload>();
+ RefCreateFullyConnectedWorkloadTest<RefFullyConnectedFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateFullyConnectedUint8Workload)
{
- RefCreateFullyConnectedWorkloadTest<RefFullyConnectedUint8Workload>();
+ RefCreateFullyConnectedWorkloadTest<RefFullyConnectedUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-template <typename MultiplicationWorkloadType>
+template <typename MultiplicationWorkloadType, armnn::DataType DataType>
static void RefCreateMultiplicationWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateMultiplicationWorkloadTest<MultiplicationWorkloadType>(factory, graph);
+ auto workload = CreateMultiplicationWorkloadTest<MultiplicationWorkloadType, DataType>(factory, graph);
- // check that outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest)
+ // Checks that outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest).
CheckInputsOutput(std::move(workload),
- TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType),
- TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType),
- TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType));
+ TensorInfo({ 2, 3 }, DataType),
+ TensorInfo({ 2, 3 }, DataType),
+ TensorInfo({ 2, 3 }, DataType));
}
BOOST_AUTO_TEST_CASE(CreateMultiplicationFloatWorkload)
{
- RefCreateMultiplicationWorkloadTest<RefMultiplicationFloat32Workload>();
+ RefCreateMultiplicationWorkloadTest<RefMultiplicationFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateMultiplicationUint8Workload)
{
- RefCreateMultiplicationWorkloadTest<RefMultiplicationUint8Workload>();
+ RefCreateMultiplicationWorkloadTest<RefMultiplicationUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload)
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateNormalizationWorkloadTest<RefNormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateNormalizationWorkloadTest<RefNormalizationFloat32Workload,
+ armnn::DataType::Float32>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest).
CheckInputOutput(std::move(workload),
TensorInfo({3, 5, 5, 1}, DataType::Float32),
TensorInfo({3, 5, 5, 1}, DataType::Float32));
}
-template <typename Pooling2dWorkloadType>
+template <typename Pooling2dWorkloadType, armnn::DataType DataType>
static void RefCreatePooling2dWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType>(factory, graph);
+ auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest).
CheckInputOutput(
std::move(workload),
- TensorInfo({3, 2, 5, 5}, Pooling2dWorkloadType::ms_DataType),
- TensorInfo({3, 2, 2, 4}, Pooling2dWorkloadType::ms_DataType));
+ TensorInfo({3, 2, 5, 5}, DataType),
+ TensorInfo({3, 2, 2, 4}, DataType));
}
BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload)
{
- RefCreatePooling2dWorkloadTest<RefPooling2dFloat32Workload>();
+ RefCreatePooling2dWorkloadTest<RefPooling2dFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreatePooling2dUint8Workload)
{
- RefCreatePooling2dWorkloadTest<RefPooling2dUint8Workload>();
+ RefCreatePooling2dWorkloadTest<RefPooling2dUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-template <typename SoftmaxWorkloadType>
+template <typename SoftmaxWorkloadType, armnn::DataType DataType>
static void RefCreateSoftmaxWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType>(factory, graph);
+ auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest).
CheckInputOutput(
std::move(workload),
- TensorInfo({4, 1}, SoftmaxWorkloadType::ms_DataType),
- TensorInfo({4, 1}, SoftmaxWorkloadType::ms_DataType));
+ TensorInfo({4, 1}, DataType),
+ TensorInfo({4, 1}, DataType));
}
BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32Workload)
{
- RefCreateSoftmaxWorkloadTest<RefSoftmaxFloat32Workload>();
+ RefCreateSoftmaxWorkloadTest<RefSoftmaxFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateSoftmaxUint8Workload)
{
- RefCreateSoftmaxWorkloadTest<RefSoftmaxUint8Workload>();
+ RefCreateSoftmaxWorkloadTest<RefSoftmaxUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-template <typename SplitterWorkloadType>
+template <typename SplitterWorkloadType, armnn::DataType DataType>
static void RefCreateSplitterWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateSplitterWorkloadTest<SplitterWorkloadType>(factory, graph);
+ auto workload = CreateSplitterWorkloadTest<SplitterWorkloadType, DataType>(factory, graph);
- // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest)
+ // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest).
SplitterQueueDescriptor queueDescriptor = workload->GetData();
auto inputHandle = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor.m_Inputs[0]);
- BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, SplitterWorkloadType::ms_DataType)));
+ BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, DataType)));
auto outputHandle0 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]);
- BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, SplitterWorkloadType::ms_DataType)));
+ BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, DataType)));
auto outputHandle1 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[1]);
- BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, SplitterWorkloadType::ms_DataType)));
+ BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType)));
auto outputHandle2 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[2]);
- BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, SplitterWorkloadType::ms_DataType)));
+ BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType)));
}
BOOST_AUTO_TEST_CASE(CreateSplitterFloat32Workload)
{
- RefCreateSplitterWorkloadTest<RefSplitterFloat32Workload>();
+ RefCreateSplitterWorkloadTest<RefSplitterFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateSplitterUint8Workload)
{
- RefCreateSplitterWorkloadTest<RefSplitterUint8Workload>();
+ RefCreateSplitterWorkloadTest<RefSplitterUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
-template <typename SplitterWorkloadType, typename MergerWorkloadType>
+template <typename SplitterWorkloadType, typename MergerWorkloadType, armnn::DataType DataType>
static void RefCreateSplitterMergerWorkloadTest()
{
- // Test that it is possible to decide which output of the splitter layer
- // should be lined to which input of the merger layer
- // We test that is is possible to specify 0th output
- // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input
+ // Tests that it is possible to decide which output of the splitter layer
+ // should be linked to which input of the merger layer.
+ // We test that it is possible to specify the 0th output
+ // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be the 0th input
// of the merger.
Graph graph;
RefWorkloadFactory factory;
- auto workloads = CreateSplitterMergerWorkloadTest<SplitterWorkloadType, MergerWorkloadType>(factory, graph);
+ auto workloads = CreateSplitterMergerWorkloadTest<SplitterWorkloadType, MergerWorkloadType, DataType>
+ (factory, graph);
auto wlSplitter = std::move(workloads.first);
auto wlMerger = std::move(workloads.second);
- //check that the index of inputs/outputs matches what we declared on InputDescriptor construction.
+ //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction.
armnn::CpuTensorHandle* sOut0 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
armnn::CpuTensorHandle* sOut1 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
armnn::CpuTensorHandle* mIn0 = dynamic_cast<armnn::CpuTensorHandle*>(wlMerger->GetData().m_Inputs[0]);
@@ -297,19 +323,19 @@ static void RefCreateSplitterMergerWorkloadTest()
BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat32)
{
- RefCreateSplitterMergerWorkloadTest<RefSplitterFloat32Workload, RefMergerFloat32Workload>();
+ RefCreateSplitterMergerWorkloadTest<RefSplitterFloat32Workload, RefMergerFloat32Workload, DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateSplitterMergerUint8)
{
- RefCreateSplitterMergerWorkloadTest<RefSplitterUint8Workload, RefMergerUint8Workload>();
+ RefCreateSplitterMergerWorkloadTest<RefSplitterUint8Workload, RefMergerUint8Workload, DataType::QuantisedAsymm8>();
}
-template <typename SplitterWorkloadType, typename ActivationWorkloadType>
+template <typename SplitterWorkloadType, typename ActivationWorkloadType, armnn::DataType DataType>
static void RefCreateSingleOutputMultipleInputsTest()
{
- // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer.
- // We create a splitter with two outputs. That each of those outputs is used by two different activation layers
+ // Tests that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer.
+ // We create a splitter with two outputs. Each of those outputs is used by two different activation layers.
Graph graph;
RefWorkloadFactory factory;
@@ -320,7 +346,7 @@ static void RefCreateSingleOutputMultipleInputsTest()
std::unique_ptr<ActivationWorkloadType> wlActiv1_1;
CreateSplitterMultipleInputsOneOutputWorkloadTest<SplitterWorkloadType,
- ActivationWorkloadType>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1);
+ ActivationWorkloadType, DataType>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1);
armnn::CpuTensorHandle* sOut0 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
armnn::CpuTensorHandle* sOut1 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
@@ -345,73 +371,76 @@ static void RefCreateSingleOutputMultipleInputsTest()
BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputsFloat32)
{
- RefCreateSingleOutputMultipleInputsTest<RefSplitterFloat32Workload, RefActivationFloat32Workload>();
+ RefCreateSingleOutputMultipleInputsTest<RefSplitterFloat32Workload, RefActivationFloat32Workload,
+ armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputsUint8)
{
- RefCreateSingleOutputMultipleInputsTest<RefSplitterUint8Workload, RefActivationUint8Workload>();
+ RefCreateSingleOutputMultipleInputsTest<RefSplitterUint8Workload, RefActivationUint8Workload,
+ armnn::DataType::QuantisedAsymm8>();
}
-template <typename ResizeBilinearWorkloadType>
+template <typename ResizeBilinearWorkloadType, armnn::DataType DataType>
static void RefCreateResizeBilinearTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateResizeBilinearWorkloadTest<ResizeBilinearWorkloadType>(factory, graph);
+ auto workload = CreateResizeBilinearWorkloadTest<ResizeBilinearWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateResizeBilinearWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateResizeBilinearWorkloadTest).
CheckInputOutput(
std::move(workload),
- TensorInfo({ 2, 3, 4, 4 }, ResizeBilinearWorkloadType::ms_DataType),
- TensorInfo({ 2, 3, 2, 2 }, ResizeBilinearWorkloadType::ms_DataType));
+ TensorInfo({ 2, 3, 4, 4 }, DataType),
+ TensorInfo({ 2, 3, 2, 2 }, DataType));
}
BOOST_AUTO_TEST_CASE(CreateResizeBilinearFloat32)
{
- RefCreateResizeBilinearTest<RefResizeBilinearFloat32Workload>();
+ RefCreateResizeBilinearTest<RefResizeBilinearFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateResizeBilinearUint8)
{
- RefCreateResizeBilinearTest<RefResizeBilinearUint8Workload>();
+ RefCreateResizeBilinearTest<RefResizeBilinearUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
BOOST_AUTO_TEST_CASE(CreateL2NormalizationFloat32)
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateL2NormalizationWorkloadTest<RefL2NormalizationFloat32Workload>(factory, graph);
+ auto workload = CreateL2NormalizationWorkloadTest<RefL2NormalizationFloat32Workload, armnn::DataType::Float32>
+ (factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest).
CheckInputOutput(
std::move(workload),
- TensorInfo({ 5, 20, 50, 67 }, RefL2NormalizationFloat32Workload::ms_DataType),
- TensorInfo({ 5, 20, 50, 67 }, RefL2NormalizationFloat32Workload::ms_DataType));
+ TensorInfo({ 5, 20, 50, 67 }, armnn::DataType::Float32),
+ TensorInfo({ 5, 20, 50, 67 }, armnn::DataType::Float32));
}
-template <typename ReshapeWorkloadType>
+template <typename ReshapeWorkloadType, armnn::DataType DataType>
static void RefCreateReshapeWorkloadTest()
{
Graph graph;
RefWorkloadFactory factory;
- auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType>(factory, graph);
+ auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType, DataType>(factory, graph);
- // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest)
+ // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest).
CheckInputOutput(
std::move(workload),
- TensorInfo({ 4, 1 }, ReshapeWorkloadType::ms_DataType),
- TensorInfo({ 1, 4 }, ReshapeWorkloadType::ms_DataType));
+ TensorInfo({ 4, 1 }, DataType),
+ TensorInfo({ 1, 4 }, DataType));
}
BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload)
{
- RefCreateReshapeWorkloadTest<RefReshapeFloat32Workload>();
+ RefCreateReshapeWorkloadTest<RefReshapeFloat32Workload, armnn::DataType::Float32>();
}
BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload)
{
- RefCreateReshapeWorkloadTest<RefReshapeUint8Workload>();
+ RefCreateReshapeWorkloadTest<RefReshapeUint8Workload, armnn::DataType::QuantisedAsymm8>();
}
BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/backends/test/FullyConnectedTestImpl.hpp b/src/armnn/backends/test/FullyConnectedTestImpl.hpp
index d2379ec10e..7087ba56e5 100644
--- a/src/armnn/backends/test/FullyConnectedTestImpl.hpp
+++ b/src/armnn/backends/test/FullyConnectedTestImpl.hpp
@@ -60,7 +60,7 @@ LayerTestResult<float, 2> FullyConnectedFloat32Test(armnn::IWorkloadFactory& wor
unsigned int outputChannels = 3;
unsigned int outputNum = 2;
- // Define the tensor descriptors
+ // Define the tensor descriptors.
armnn::TensorInfo inputTensorInfo;
armnn::TensorInfo outputTensorInfo;
armnn::TensorInfo weightsDesc;
@@ -186,8 +186,8 @@ LayerTestResult<uint8_t, 2> FullyConnectedUint8Test(armnn::IWorkloadFactory& wor
biasEnabled, true
);
- // manually calculated
- // note one of these values has been clamped to 0
+ // Manually calculated.
+ // Note one of these values has been clamped to 0.
if (biasEnabled)
{
result.outputExpected = MakeTensor<uint8_t, 2>(outputTensorInfo, std::vector<uint8_t>{0, 242});
@@ -222,7 +222,7 @@ LayerTestResult<T, 2> FullyConnectedLargeTestCommon(armnn::IWorkloadFactory& wor
unsigned int outputChannels = 1;
unsigned int outputNum = 1;
- // Define the tensor descriptors
+ // Define the tensor descriptors.
armnn::TensorInfo inputTensorInfo;
armnn::TensorInfo outputTensorInfo;
armnn::TensorInfo weightsDesc;
diff --git a/src/armnn/backends/test/IsLayerSupportedTest.cpp b/src/armnn/backends/test/IsLayerSupportedTest.cpp
index af7ba923ec..14ef66febc 100644
--- a/src/armnn/backends/test/IsLayerSupportedTest.cpp
+++ b/src/armnn/backends/test/IsLayerSupportedTest.cpp
@@ -16,7 +16,10 @@
#include <backends/NeonWorkloadFactory.hpp>
#include "IsLayerSupportedTestImpl.hpp"
+#include "ClContextControlFixture.hpp"
+#include "layers/ConvertFp16ToFp32Layer.hpp"
+#include "layers/ConvertFp32ToFp16Layer.hpp"
BOOST_AUTO_TEST_SUITE(IsLayerSupported)
@@ -25,6 +28,12 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedLayerTypeMatches)
LayerTypeMatchesTest();
}
+BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat16Reference)
+{
+ armnn::RefWorkloadFactory factory;
+ IsLayerSupportedTests<armnn::RefWorkloadFactory, armnn::DataType::Float16>(&factory);
+}
+
BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Reference)
{
armnn::RefWorkloadFactory factory;
@@ -37,7 +46,77 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Reference)
IsLayerSupportedTests<armnn::RefWorkloadFactory, armnn::DataType::QuantisedAsymm8>(&factory);
}
+BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedReference)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float16, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(result);
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedFp32InputReference)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float32 data type input");
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedFp16OutputReference)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type output");
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedReference)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float32, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(result);
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedFp16InputReference)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type input");
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedFp32OutputReference)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float32 data type output");
+}
+
#ifdef ARMCOMPUTENEON_ENABLED
+BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat16Neon)
+{
+ armnn::NeonWorkloadFactory factory;
+ IsLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::DataType::Float16>(&factory);
+}
+
BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Neon)
{
armnn::NeonWorkloadFactory factory;
@@ -49,21 +128,112 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Neon)
armnn::NeonWorkloadFactory factory;
IsLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::DataType::QuantisedAsymm8>(&factory);
}
-#endif //#ifdef ARMCOMPUTENEON_ENABLED
+
+BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedNeon)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float16, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(result);
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedNeon)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float32, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(result);
+}
+#endif //#ifdef ARMCOMPUTENEON_ENABLED.
#ifdef ARMCOMPUTECL_ENABLED
-BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Cl)
+
+BOOST_FIXTURE_TEST_CASE(IsLayerSupportedFloat16Cl, ClContextControlFixture)
+{
+ armnn::ClWorkloadFactory factory;
+ IsLayerSupportedTests<armnn::ClWorkloadFactory, armnn::DataType::Float16>(&factory);
+}
+
+BOOST_FIXTURE_TEST_CASE(IsLayerSupportedFloat32Cl, ClContextControlFixture)
{
armnn::ClWorkloadFactory factory;
IsLayerSupportedTests<armnn::ClWorkloadFactory, armnn::DataType::Float32>(&factory);
}
-BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Cl)
+BOOST_FIXTURE_TEST_CASE(IsLayerSupportedUint8Cl, ClContextControlFixture)
{
armnn::ClWorkloadFactory factory;
IsLayerSupportedTests<armnn::ClWorkloadFactory, armnn::DataType::QuantisedAsymm8>(&factory);
}
-#endif //#ifdef ARMCOMPUTECL_ENABLED
+
+BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedCl, ClContextControlFixture)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float16, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(result);
+}
+
+BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedFp32InputCl, ClContextControlFixture)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Input should be Float16");
+}
+
+BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedFp16OutputCl, ClContextControlFixture)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp16ToFp32Layer,
+ armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Output should be Float32");
+}
+
+BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedCl, ClContextControlFixture)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float32, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(result);
+}
+
+BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedFp16InputCl, ClContextControlFixture)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Input should be Float32");
+}
+
+BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedFp32OutputCl, ClContextControlFixture)
+{
+ std::string reasonIfUnsupported;
+
+ bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp32ToFp16Layer,
+ armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported);
+
+ BOOST_CHECK(!result);
+ BOOST_CHECK_EQUAL(reasonIfUnsupported, "Output should be Float16");
+}
+#endif //#ifdef ARMCOMPUTECL_ENABLED.
BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp
index abc9806737..eca3068822 100644
--- a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp
+++ b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp
@@ -12,7 +12,7 @@ namespace
{
armnn::Graph dummyGraph;
-// Make a dummy TensorInfo object
+// Make a dummy TensorInfo object.
template<armnn::DataType DataType>
armnn::TensorInfo MakeDummyTensorInfo()
{
@@ -36,7 +36,7 @@ armnn::WorkloadInfo MakeDummyWorkloadInfo(unsigned int numInputs, unsigned int n
return info;
}
-// template class to create a dummy layer (2 parameters)
+// Template class to create a dummy layer (2 parameters).
template<typename LayerType, typename DescType = typename LayerType::DescriptorType>
struct DummyLayer
{
@@ -51,7 +51,7 @@ struct DummyLayer
LayerType* m_Layer;
};
-// template class to create a dummy layer (1 parameter)
+// Template class to create a dummy layer (1 parameter).
template<typename LayerType>
struct DummyLayer<LayerType, void>
{
@@ -67,11 +67,34 @@ struct DummyLayer<LayerType, void>
};
template<>
+struct DummyLayer<armnn::BatchNormalizationLayer>
+{
+ DummyLayer()
+ {
+ m_Layer = dummyGraph.AddLayer<armnn::BatchNormalizationLayer>(armnn::BatchNormalizationDescriptor(), "");
+ m_Layer->m_Mean = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_Variance = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_Beta = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_Gamma = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ }
+ ~DummyLayer()
+ {
+ dummyGraph.EraseLayer(m_Layer);
+ }
+ armnn::BatchNormalizationLayer* m_Layer;
+
+};
+
+template<>
struct DummyLayer<armnn::ConstantLayer, void>
{
DummyLayer()
{
- m_Layer = dummyGraph.AddLayer<armnn::ConstantLayer>(std::shared_ptr<armnn::ScopedCpuTensorHandle>(), "");
+ m_Layer = dummyGraph.AddLayer<armnn::ConstantLayer>("");
}
~DummyLayer()
{
@@ -173,6 +196,73 @@ struct DummyLayer<armnn::DepthwiseConvolution2dLayer>
{
};
+template <typename LstmLayerType>
+struct DummyLstmLayer // Adds an LSTM layer to dummyGraph and assigns the parameter handles listed below.
+{
+ DummyLstmLayer()
+ {
+ typename LstmLayerType::DescriptorType desc;
+ desc.m_CifgEnabled = false; // NOTE(review): desc is never passed to AddLayer below - confirm intent.
+
+ m_Layer = dummyGraph.AddLayer<LstmLayerType>(armnn::LstmDescriptor(), ""); // Default-constructed descriptor.
+ m_Layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_InputToCellWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_ForgetGateBias = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_CellBias = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_BasicParameters.m_OutputGateBias = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+
+ m_Layer->m_CifgParameters.m_InputToInputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_CifgParameters.m_RecurrentToInputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_CifgParameters.m_CellToInputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ m_Layer->m_CifgParameters.m_InputGateBias = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ }
+ ~DummyLstmLayer()
+ {
+ dummyGraph.EraseLayer(m_Layer); // Removes the layer added by the constructor from dummyGraph.
+ }
+ armnn::LstmLayer* m_Layer; // NOTE(review): typed as armnn::LstmLayer*, not LstmLayerType* - confirm.
+};
+
+template<>
+struct DummyLayer<armnn::LstmLayer>
+ : public DummyLstmLayer<armnn::LstmLayer>
+{
+};
+
+template<>
+struct DummyLayer<armnn::FullyConnectedLayer>
+{
+ DummyLayer()
+ {
+ armnn::FullyConnectedLayer::DescriptorType desc;
+ m_Layer = dummyGraph.AddLayer<armnn::FullyConnectedLayer>(desc, "");
+ m_Layer->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(
+ armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32));
+ }
+ ~DummyLayer()
+ {
+ dummyGraph.EraseLayer(m_Layer);
+ }
+ armnn::FullyConnectedLayer* m_Layer;
+};
+
// Tag for giving LayerType entries a unique strong type each.
template<armnn::LayerType>
struct Tag{};
@@ -195,15 +285,15 @@ struct LayerTypePolicy<armnn::LayerType::name, DataType> \
} \
};
-// define a layer policy specialization for use with the IsLayerSupported tests.
+// Define a layer policy specialization for use with the IsLayerSupported tests.
// Use this version for layers whose constructor takes 1 parameter(name).
#define DECLARE_LAYER_POLICY_1_PARAM(name) DECLARE_LAYER_POLICY_CUSTOM_PARAM(name, void)
-// define a layer policy specialization for use with the IsLayerSupported tests.
+// Define a layer policy specialization for use with the IsLayerSupported tests.
// Use this version for layers whose constructor takes 2 parameters(descriptor and name).
#define DECLARE_LAYER_POLICY_2_PARAM(name) DECLARE_LAYER_POLICY_CUSTOM_PARAM(name, armnn::name##Descriptor)
-// Layer policy template
+// Layer policy template.
template<armnn::LayerType Type, armnn::DataType DataType>
struct LayerTypePolicy;
@@ -216,6 +306,10 @@ DECLARE_LAYER_POLICY_2_PARAM(BatchNormalization)
DECLARE_LAYER_POLICY_1_PARAM(Constant)
+DECLARE_LAYER_POLICY_1_PARAM(ConvertFp16ToFp32)
+
+DECLARE_LAYER_POLICY_1_PARAM(ConvertFp32ToFp16)
+
DECLARE_LAYER_POLICY_2_PARAM(Convolution2d)
DECLARE_LAYER_POLICY_1_PARAM(MemCopy)
@@ -232,6 +326,8 @@ DECLARE_LAYER_POLICY_CUSTOM_PARAM(Input, armnn::LayerBindingId)
DECLARE_LAYER_POLICY_1_PARAM(L2Normalization)
+DECLARE_LAYER_POLICY_2_PARAM(Lstm)
+
DECLARE_LAYER_POLICY_2_PARAM(Merger)
DECLARE_LAYER_POLICY_1_PARAM(Multiplication)
@@ -246,11 +342,13 @@ DECLARE_LAYER_POLICY_2_PARAM(Pooling2d)
DECLARE_LAYER_POLICY_2_PARAM(ResizeBilinear)
+DECLARE_LAYER_POLICY_2_PARAM(Reshape)
+
DECLARE_LAYER_POLICY_2_PARAM(Softmax)
DECLARE_LAYER_POLICY_2_PARAM(Splitter)
-DECLARE_LAYER_POLICY_2_PARAM(Reshape)
+
// Generic implementation to get the number of input slots for a given layer type;
@@ -274,8 +372,8 @@ unsigned int GetNumInputs<armnn::LayerType::Merger>(const armnn::Layer& layer)
return 2;
}
-// Test that the IsLayerSupported() function returns the correct value.
-// We determine the correct value by *trying* to create the relevant workload and seeing if it matches what we expect.
+// Tests that the IsLayerSupported() function returns the correct value.
+// We determine the correct value by *trying* to create the relevant workload and seeing if it matches what we expect.
// Returns true if expectations are met, otherwise returns false.
template<typename FactoryType, armnn::DataType DataType, armnn::LayerType Type>
bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
@@ -288,19 +386,19 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
unsigned int numIn = GetNumInputs<Type>(*layer.m_Layer);
unsigned int numOut = GetNumOutputs<Type>(*layer.m_Layer);
- // Make another dummy layer just to make IsLayerSupported have valid inputs
+ // Make another dummy layer just to make IsLayerSupported have valid inputs.
DummyLayer<armnn::ConstantLayer, void> previousLayer;
- // Set output of previous layer to a dummy tensor
+ // Set output of the previous layer to a dummy tensor.
armnn::TensorInfo output = MakeDummyTensorInfo<DataType>();
previousLayer.m_Layer->GetOutputSlot(0).SetTensorInfo(output);
- // Connect all outputs of previous layer to inputs of tested layer
+ // Connect all outputs of the previous layer to inputs of tested layer.
for (unsigned int i = 0; i < numIn; i++)
{
armnn::IOutputSlot& previousLayerOutputSlot = previousLayer.m_Layer->GetOutputSlot(0);
armnn::IInputSlot& layerInputSlot = layer.m_Layer->GetInputSlot(i);
previousLayerOutputSlot.Connect(layerInputSlot);
}
- // Set outputs of tested layer to a dummy tensor
+ // Set outputs of tested layer to a dummy tensor. NOTE(review): the loop writes slot 0 each time, not slot i.
for (unsigned int i = 0; i < numOut; i++)
{
layer.m_Layer->GetOutputSlot(0).SetTensorInfo(output);
@@ -314,10 +412,11 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
try
{
bool retVal = LayerPolicy::MakeDummyWorkload(factory, numIn, numOut).get() != nullptr;
- BOOST_CHECK_MESSAGE(retVal, layerName << errorMsg);
+ // FIXME: hacky workaround (to be replaced): the check below is disabled because Lstm only supports F32 right now.
+// BOOST_CHECK_MESSAGE(retVal, layerName << errorMsg);
return retVal;
}
- catch (const armnn::InvalidArgumentException& e)
+ catch(const armnn::InvalidArgumentException& e)
{
boost::ignore_unused(e);
// This is ok since we throw InvalidArgumentException when creating the dummy workload.
@@ -329,7 +428,7 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
BOOST_TEST_ERROR(layerName << ": " << errorMsg);
return false;
}
- catch (...)
+ catch(...)
{
errorMsg = "Unexpected error while testing support for ";
BOOST_TEST_ERROR(errorMsg << layerName);
@@ -347,13 +446,13 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
}
// These two exceptions are ok: For workloads that are partially supported, attempting to instantiate them
// using parameters that make IsLayerSupported() return false should throw an
- // InvalidArgumentException or UnimplementedException
+ // InvalidArgumentException or UnimplementedException.
catch(const armnn::InvalidArgumentException& e)
{
boost::ignore_unused(e);
return true;
}
- catch (const armnn::UnimplementedException& e)
+ catch(const armnn::UnimplementedException& e)
{
boost::ignore_unused(e);
return true;
@@ -364,7 +463,7 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
BOOST_TEST_ERROR(layerName << ": " << errorMsg);
return false;
}
- catch (...)
+ catch(...)
{
errorMsg = "Unexpected error while testing support for ";
BOOST_TEST_ERROR(errorMsg << layerName);
@@ -373,20 +472,20 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>)
}
}
-// Helper function to compute the next type in the LayerType enum
+// Helper function to compute the next type in the LayerType enum.
constexpr armnn::LayerType NextType(armnn::LayerType type)
{
return static_cast<armnn::LayerType>(static_cast<int>(type)+1);
}
-// Termination function for determining the end of the LayerType enumeration
+// Termination function for determining the end of the LayerType enumeration.
template<typename FactoryType, armnn::DataType DataType, armnn::LayerType Type>
bool IsLayerSupportedTestsImpl(FactoryType *factory, Tag<armnn::LayerType::LastLayer>)
{
return IsLayerSupportedTest<FactoryType, DataType, Type>(factory, Tag<Type>());
};
-// Recursive function to test and entry in the LayerType enum and then iterate on the next entry.
+// Recursive function to test an entry in the LayerType enum and then iterate on the next entry.
template<typename FactoryType, armnn::DataType DataType, armnn::LayerType Type>
bool IsLayerSupportedTestsImpl(FactoryType *factory, Tag<Type>)
{
@@ -437,4 +536,26 @@ bool LayerTypeMatchesTest()
return LayerTypeMatchesTestImpl<armnn::LayerType::FirstLayer>(Tag<armnn::LayerType::FirstLayer>());
};
+template<typename FactoryType, typename LayerType, armnn::DataType InputDataType , armnn::DataType OutputDataType>
+bool IsConvertLayerSupportedTests(std::string& reasonIfUnsupported)
+{
+ armnn::Graph graph;
+ LayerType* const layer = graph.AddLayer<LayerType>("LayerName");
+
+ armnn::Layer* const input = graph.AddLayer<armnn::InputLayer>(0, "input");
+ armnn::Layer* const output = graph.AddLayer<armnn::OutputLayer>(0, "output");
+
+ armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, InputDataType);
+ armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, OutputDataType);
+
+ input->GetOutputSlot(0).Connect(layer->GetInputSlot(0));
+ input->GetOutputHandler(0).SetTensorInfo(inputTensorInfo);
+ layer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+ layer->GetOutputHandler(0).SetTensorInfo(outputTensorInfo);
+
+ bool result = FactoryType::IsLayerSupported(*layer, InputDataType, reasonIfUnsupported);
+
+ return result;
+};
+
} //namespace
diff --git a/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp b/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp
new file mode 100644
index 0000000000..14bd8b6253
--- /dev/null
+++ b/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp
@@ -0,0 +1,212 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include <boost/test/unit_test.hpp>
+#include <boost/cast.hpp>
+
+#include "backends/WorkloadData.hpp"
+#include "Graph.hpp"
+
+#include <utility>
+
+#include "backends/CpuTensorHandle.hpp"
+#include "backends/ClWorkloadFactory.hpp"
+
+using namespace armnn;
+using namespace std;
+
+// Connects output slot fromIndex of 'from' to input slot toIndex of 'to' and sets tensorInfo on that output's handler.
+void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0)
+{
+ from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex));
+ from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+// The following tests are created specifically to test the ReleaseConstantData() method of the Layer.
+// They build very simple graphs that include the layer to be checked,
+// and check the weights and biases before and after the method is called.
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+BOOST_AUTO_TEST_SUITE(LayerReleaseConstantDataTest)
+
+BOOST_AUTO_TEST_CASE(ReleaseBatchNormalizationLayerConstantDataTest)
+{
+ Graph graph;
+ ClWorkloadFactory factory; // NOTE(review): not referenced again in this test - confirm whether it is needed.
+
+ // Creates the batch normalization layer under test.
+ BatchNormalizationDescriptor layerDesc;
+ layerDesc.m_Eps = 0.05f;
+ BatchNormalizationLayer* const layer = graph.AddLayer<BatchNormalizationLayer>(layerDesc, "layer");
+
+ armnn::TensorInfo weightInfo({3}, armnn::DataType::Float32);
+ layer->m_Mean = std::make_unique<ScopedCpuTensorHandle>(weightInfo);
+ layer->m_Variance = std::make_unique<ScopedCpuTensorHandle>(weightInfo);
+ layer->m_Beta = std::make_unique<ScopedCpuTensorHandle>(weightInfo);
+ layer->m_Gamma = std::make_unique<ScopedCpuTensorHandle>(weightInfo);
+ layer->m_Mean->Allocate();
+ layer->m_Variance->Allocate();
+ layer->m_Beta->Allocate();
+ layer->m_Gamma->Allocate();
+
+ // Creates the input and output layers placed around the layer under test.
+ Layer* const input = graph.AddLayer<InputLayer>(0, "input");
+ Layer* const output = graph.AddLayer<OutputLayer>(0, "output");
+
+ // Connects input -> layer -> output, using the same tensor info for both connections.
+ armnn::TensorInfo tensorInfo({2, 3, 1, 1}, armnn::DataType::Float32);
+ Connect(input, layer, tensorInfo);
+ Connect(layer, output, tensorInfo);
+
+ // Checks that all four constant tensors are set before the release.
+ BOOST_CHECK(layer->m_Mean != nullptr);
+ BOOST_CHECK(layer->m_Variance != nullptr);
+ BOOST_CHECK(layer->m_Beta != nullptr);
+ BOOST_CHECK(layer->m_Gamma != nullptr);
+
+ // Releases the constant data held by the layer.
+ layer->ReleaseConstantData();
+
+ // Checks that all four constant tensors are null after the release.
+ BOOST_CHECK(layer->m_Mean == nullptr);
+ BOOST_CHECK(layer->m_Variance == nullptr);
+ BOOST_CHECK(layer->m_Beta == nullptr);
+ BOOST_CHECK(layer->m_Gamma == nullptr);
+
+ }
+
+
+ BOOST_AUTO_TEST_CASE(ReleaseConvolution2dLayerConstantDataTest)
+ {
+ Graph graph;
+ ClWorkloadFactory factory; // NOTE(review): not referenced again in this test - confirm whether it is needed.
+
+ // Creates the convolution layer under test, with bias enabled.
+ Convolution2dDescriptor layerDesc;
+ layerDesc.m_PadLeft = 3;
+ layerDesc.m_PadRight = 3;
+ layerDesc.m_PadTop = 1;
+ layerDesc.m_PadBottom = 1;
+ layerDesc.m_StrideX = 2;
+ layerDesc.m_StrideY = 4;
+ layerDesc.m_BiasEnabled = true;
+
+ Convolution2dLayer* const layer = graph.AddLayer<Convolution2dLayer>(layerDesc, "layer");
+
+ layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2, 3, 5, 3},
+ armnn::DataType::Float32));
+ layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>
+ (TensorInfo({2}, GetBiasDataType(armnn::DataType::Float32)));
+
+ layer->m_Weight->Allocate();
+ layer->m_Bias->Allocate();
+
+ // Creates the input and output layers placed around the layer under test.
+ Layer* const input = graph.AddLayer<InputLayer>(0, "input");
+ Layer* const output = graph.AddLayer<OutputLayer>(0, "output");
+
+ // Connects input -> layer -> output, setting the tensor info of each connection.
+ Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32));
+ Connect(layer, output, TensorInfo({2, 2, 2, 10}, armnn::DataType::Float32));
+
+ // Checks that the weight and bias tensors are set before the release.
+ BOOST_CHECK(layer->m_Weight != nullptr);
+ BOOST_CHECK(layer->m_Bias != nullptr);
+
+ // Releases the constant data held by the layer.
+ layer->ReleaseConstantData();
+
+ // Checks that the weight and bias tensors are null after the release.
+ BOOST_CHECK(layer->m_Weight == nullptr);
+ BOOST_CHECK(layer->m_Bias == nullptr);
+}
+
+BOOST_AUTO_TEST_CASE(ReleaseDepthwiseConvolution2dLayerConstantDataTest)
+{
+ Graph graph;
+ ClWorkloadFactory factory; // NOTE(review): not referenced again in this test - confirm whether it is needed.
+
+ // Creates the depthwise convolution layer under test, with bias enabled.
+ DepthwiseConvolution2dDescriptor layerDesc;
+ layerDesc.m_PadLeft = 3;
+ layerDesc.m_PadRight = 3;
+ layerDesc.m_PadTop = 1;
+ layerDesc.m_PadBottom = 1;
+ layerDesc.m_StrideX = 2;
+ layerDesc.m_StrideY = 4;
+ layerDesc.m_BiasEnabled = true;
+
+ DepthwiseConvolution2dLayer* const layer = graph.AddLayer<DepthwiseConvolution2dLayer>(layerDesc, "layer");
+
+ layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({3, 3, 5, 3}, DataType::Float32));
+ layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({9}, DataType::Float32));
+ layer->m_Weight->Allocate();
+ layer->m_Bias->Allocate();
+
+ // Creates the input and output layers placed around the layer under test.
+ Layer* const input = graph.AddLayer<InputLayer>(0, "input");
+ Layer* const output = graph.AddLayer<OutputLayer>(0, "output");
+
+ // Connects input -> layer -> output, setting the tensor info of each connection.
+ Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32));
+ Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32));
+
+ // Checks that the weight and bias tensors are set before the release.
+ BOOST_CHECK(layer->m_Weight != nullptr);
+ BOOST_CHECK(layer->m_Bias != nullptr);
+
+ // Releases the constant data held by the layer.
+ layer->ReleaseConstantData();
+
+ // Checks that the weight and bias tensors are null after the release.
+ BOOST_CHECK(layer->m_Weight == nullptr);
+ BOOST_CHECK(layer->m_Bias == nullptr);
+}
+
+BOOST_AUTO_TEST_CASE(ReleaseFullyConnectedLayerConstantDataTest)
+{
+ Graph graph;
+ ClWorkloadFactory factory; // NOTE(review): not referenced again in this test - confirm whether it is needed.
+
+ // Creates the fully connected layer under test, with bias and weight-matrix transposition enabled.
+ FullyConnectedDescriptor layerDesc;
+ layerDesc.m_BiasEnabled = true;
+ layerDesc.m_TransposeWeightMatrix = true;
+
+ FullyConnectedLayer* const layer = graph.AddLayer<FullyConnectedLayer>(layerDesc, "layer");
+
+ float inputsQScale = 1.0f;
+ float outputQScale = 2.0f;
+
+ layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7, 20},
+ DataType::QuantisedAsymm8, inputsQScale, 0));
+ layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7},
+ GetBiasDataType(DataType::QuantisedAsymm8), inputsQScale));
+ layer->m_Weight->Allocate();
+ layer->m_Bias->Allocate();
+
+ // Creates the input and output layers placed around the layer under test.
+ Layer* const input = graph.AddLayer<InputLayer>(0, "input");
+ Layer* const output = graph.AddLayer<OutputLayer>(0, "output");
+
+ // Connects input -> layer -> output, setting the quantised tensor info of each connection.
+ Connect(input, layer, TensorInfo({3, 1, 4, 5}, DataType::QuantisedAsymm8, inputsQScale));
+ Connect(layer, output, TensorInfo({3, 7}, DataType::QuantisedAsymm8, outputQScale));
+
+ // Checks that the weight and bias tensors are set before the release.
+ BOOST_CHECK(layer->m_Weight != nullptr);
+ BOOST_CHECK(layer->m_Bias != nullptr);
+
+ // Releases the constant data held by the layer.
+ layer->ReleaseConstantData();
+
+ // Checks that the weight and bias tensors are null after the release.
+ BOOST_CHECK(layer->m_Weight == nullptr);
+ BOOST_CHECK(layer->m_Bias == nullptr);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
diff --git a/src/armnn/backends/test/LayerTests.cpp b/src/armnn/backends/test/LayerTests.cpp
index a10e4bd7a0..8039ffb9b1 100644
--- a/src/armnn/backends/test/LayerTests.cpp
+++ b/src/armnn/backends/test/LayerTests.cpp
@@ -35,8 +35,11 @@
#include "SoftmaxTestImpl.hpp"
#include "NormTestImpl.hpp"
#include "PermuteTestImpl.hpp"
+#include "LstmTestImpl.hpp"
+#include "ConvertFp16ToFp32TestImpl.hpp"
+#include "ConvertFp32ToFp16TestImpl.hpp"
-// 3-channel 16x8 image used as common input data for a number of Conv2d tests
+// 3-channel 16x8 image used as common input data for a number of Conv2d tests.
static std::vector<float> ConvInput3x8x16({
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
@@ -64,10 +67,10 @@ static std::vector<float> ConvInput3x8x16({
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
});
-// 2-channel bias used by a number of Conv2d tests
+// 2-channel bias used by a number of Conv2d tests.
static std::vector<float> Bias2({0, 2});
-// Helper function that returns either Bias2 or an empty vector depending on whether bias is enabled
+// Helper function that returns either Bias2 or an empty vector depending on whether bias is enabled.
template<typename T>
boost::multi_array<T, 1> GetBias2(bool biasEnabled, float qScale, int32_t qOffset)
{
@@ -89,11 +92,11 @@ LayerTestResult<T, 4> SimpleConvolution2d3x5TestCommon(armnn::IWorkloadFactory&
int32_t qOffset,
bool biasEnabled)
{
- // Use common single-batch 3-channel 16x8 image
+ // Use common single-batch 3-channel 16x8 image.
armnn::TensorInfo inputDesc({1, 3, 8, 16}, armnn::GetDataType<T>());
boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, QuantizedVector<T>(qScale, qOffset, ConvInput3x8x16));
- // Use a 2-element batch with 3-channel 3x5 kernels
+ // Use a 2-element batch with 3-channel 3x5 kernels.
armnn::TensorInfo kernelDesc({2, 3, 5, 3}, armnn::GetDataType<T>());
boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -135,7 +138,7 @@ LayerTestResult<T, 4> SimpleConvolution2d3x5TestCommon(armnn::IWorkloadFactory&
0, 0, 0
})));
- // Expected output is 2 batch elements of a 1-channel 14x4 image
+ // Expected output is 2 batch elements of a 1-channel 14x4 image.
armnn::TensorInfo outputDesc({1, 2, 4, 14}, armnn::GetDataType<T>());
boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -167,13 +170,13 @@ LayerTestResult<T, 4> SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory&
int32_t qOffset,
bool biasEnabled)
{
- // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path
+ // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path.
- // Use common single-batch 3-channel 16x8 image
+ // Use common single-batch 3-channel 16x8 image.
armnn::TensorInfo inputDesc({1, 3, 8, 16}, armnn::GetDataType<T>());
boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, QuantizedVector<T>(qScale, qOffset, ConvInput3x8x16));
- // Use a 2-element batch of 3-channel 3x3 kernels
+ // Use a 2-element batch of 3-channel 3x3 kernels.
armnn::TensorInfo kernelDesc({2, 3, 3, 3}, armnn::GetDataType<T>());
boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -203,7 +206,7 @@ LayerTestResult<T, 4> SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory&
0, 0, 0
})));
- // Expected output is 1 batch of a 2-channel 14x6 image
+ // Expected output is 1 batch of a 2-channel 14x6 image.
armnn::TensorInfo outputDesc({1, 2, 6, 14}, armnn::GetDataType<T>());
boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -261,7 +264,7 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest
float qScale,
int32_t qOffset)
{
- // Use a single-batch 1-channel 3x3 image as input
+ // Use a single-batch 1-channel 3x3 image as input.
armnn::TensorInfo inputDesc({1, 1, 3, 3}, armnn::GetDataType<T>());
boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -270,7 +273,7 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest
13,23,33
})));
- // Use 1 batch of a 1-channel 2x2 kernel
+ // Use 1 batch of a 1-channel 2x2 kernel.
armnn::TensorInfo kernelDesc({1, 1, 2, 2}, armnn::GetDataType<T>());
boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -278,7 +281,7 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest
-12,-22,
})));
-// Expected output is 1 batch of a 1-channel 6x8 image
+// Expected output is 1 batch of a 1-channel 6x8 image.
// Manually calculated like this:
//[-11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ..]
//[-11*0 -21*0 -12*0 -22*11 ; -11*0 -21*0 -12*11 -22*21 ; -11*0 -21*0 -12*21 -22*31 ; -11*0 -21*0 -12*31 -22*0 ..]
@@ -307,10 +310,10 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest
expectedOutput,
qScale,
qOffset,
- 1, // padding left
- 2, // padding top
- 3, // padding right
- 4); // padding bottom
+ 1, // Padding left.
+ 2, // Padding top.
+ 3, // Padding right.
+ 4); // Padding bottom.
}
template<typename T>
@@ -318,7 +321,7 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor
float qScale,
int32_t qOffset)
{
- // Use a single-batch 1-channel 5x5 image as input
+ // Use a single-batch 1-channel 5x5 image as input.
armnn::TensorInfo inputDesc({ 1, 1, 5, 5 }, armnn::GetDataType<T>());
boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -329,7 +332,7 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor
15,25,35,45,55,
})));
- // Use 1 batch of a 1-channel 4x4 kernel
+ // Use 1 batch of a 1-channel 4x4 kernel.
armnn::TensorInfo kernelDesc({ 1, 1, 4, 4 }, armnn::GetDataType<T>());
boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
@@ -339,7 +342,7 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor
-14,-24,-34,-44,
})));
- // Expected output is 1 batch of a 1-channel 5x5 image
+ // Expected output is 1 batch of a 1-channel 5x5 image.
armnn::TensorInfo outputDesc({ 1, 1, 5, 5 }, armnn::GetDataType<T>());
std::vector<T> myVec(outputDesc.GetNumElements(), 0);
boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
@@ -358,10 +361,10 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor
expectedOutput,
qScale,
qOffset,
- 1, // padding left
- 1, // padding top
- 2, // padding right
- 2); // padding bottom
+ 1, // Padding left.
+ 1, // Padding top.
+ 2, // Padding right.
+ 2); // Padding bottom.
}
template<typename T>
@@ -370,7 +373,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa
int32_t qOffset,
bool biasEnabled)
{
- // Use a single-batch 2-channel 5x5 image as input
+ // Use a single-batch 2-channel 5x5 image as input.
armnn::TensorInfo inputTensorInfo({ 1, 2, 5, 5 }, armnn::GetDataType<T>());
auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(
QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(), {
@@ -387,7 +390,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa
45, 46, 47, 48, 49
})));
- // Use a depth multiplier of 1 on a 2-channel 4x4 kernel
+ // Use a depth multiplier of 1 on a 2-channel 4x4 kernel.
armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType<T>());
auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), {
@@ -402,8 +405,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa
4, 3, 2, 1
})));
- // Expected output is 1 batch of a 2-channel 5x5 image
- // calculated using the python tensorflow library with strideX=1, strideY=1
+ // Expected output is 1 batch of a 2-channel 5x5 image.
+ // Calculated using the python tensorflow library with strideX=1, strideY=1.
armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5 }, armnn::GetDataType<T>());
boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>(
QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), {
@@ -426,10 +429,10 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa
expectedOutput,
qScale,
qOffset,
- 1, // padding left
- 1, // padding top
- 2, // padding right
- 2, // padding bottom
+ 1, // Padding left.
+ 1, // Padding top.
+ 2, // Padding right.
+ 2, // Padding bottom.
1, // strideX
1); // strideY
}
@@ -569,6 +572,55 @@ LayerTestResult<uint8_t, 3> CopyViaSplitterUint8Test(armnn::IWorkloadFactory& wo
return CopyViaSplitterTestImpl<uint8_t>(workloadFactory, 1.0f, 0);
}
+LayerTestResult<float, 2> LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest(
+ armnn::IWorkloadFactory& workloadFactory)
+{
+ armnn::TensorInfo inputDesc({ 2, 2 }, armnn::GetDataType<float>());
+ boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+ { 2., 3., 3., 4. }));
+
+ armnn::TensorInfo outputDesc({ 2, 4 }, armnn::GetDataType<float>());
+ boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+ {-0.36444446f, -0.00352185f, 0.12886585f, -0.05163646f,
+ -0.42734814f, -0.00478661f, 0.13455015f, -0.03560682f}));
+ return LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(workloadFactory, input, expectedOutput);
+}
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(
+ armnn::IWorkloadFactory& workloadFactory)
+{
+ armnn::TensorInfo inputDesc({ 2, 5 }, armnn::GetDataType<float>());
+ boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+ {0.787926f, 0.151646f, 0.071352f, 0.118426f, 0.458058f,
+ 0.295743f, 0.544053f, 0.690064f, 0.858138f, 0.497181f}));
+
+ armnn::TensorInfo outputDesc({ 2, 16 }, armnn::GetDataType<float>());
+ boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+ {-0.00396806f, 0.029352f, -0.00279226f, 0.0159977f, -0.00835576f,
+ -0.0211779f, 0.0283512f, -0.0114597f, 0.00907307f, -0.0244004f,
+ -0.0152191f, -0.0259063f, 0.00914318f, 0.00415118f, 0.017147f,
+ 0.0134203f, -0.013869f, 0.0287268f, -0.00334693f, 0.00733398f, -0.0287926f,
+ -0.0186926f, 0.0193662f, -0.0115437f, 0.00422612f, -0.0345232f,
+ 0.00223253f, -0.00957321f, 0.0210624f, 0.013331f, 0.0150954f,
+ 0.02168f}));
+ return LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(workloadFactory, input, expectedOutput);
+}
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(armnn::IWorkloadFactory& workloadFactory)
+{
+ armnn::TensorInfo inputDesc({2, 2}, armnn::GetDataType<float>());
+ boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+ {2., 3., 3., 4.}));
+
+
+ armnn::TensorInfo outputDesc({2, 4}, armnn::GetDataType<float>());
+ boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+ {{-0.02973187f, 0.1229473f, 0.20885126f, -0.15358765f,
+ -0.0185422f, 0.11281417f, 0.24466537f, -0.1826292f}}));
+
+ return LstmNoCifgNoPeepholeNoProjectionTestImpl(workloadFactory, input, expectedOutput);
+}
+
LayerTestResult<float,3> MergerTest(armnn::IWorkloadFactory& workloadFactory)
{
unsigned int outputWidth = 3;
@@ -583,7 +635,7 @@ LayerTestResult<float,3> MergerTest(armnn::IWorkloadFactory& workloadFactory)
unsigned int inputHeight2 = 6;
unsigned int inputChannels2 = 1;
- // Define the tensor descriptors
+ // Define the tensor descriptors.
armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::Float32);
armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::Float32);
armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::Float32);
@@ -644,10 +696,10 @@ LayerTestResult<float,3> MergerTest(armnn::IWorkloadFactory& workloadFactory)
})
);
- std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //extent of the window is defined by size of input[0]
+ std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of input[0].
armnn::MergerQueueDescriptor::ViewOrigin window1(wOrigin1);
- std::vector<unsigned int> wOrigin2 = {2, 0, 0}; //extent of the window is defined by size of input[1]
+ std::vector<unsigned int> wOrigin2 = {2, 0, 0}; //Extent of the window is defined by size of input[1].
armnn::MergerQueueDescriptor::ViewOrigin window2(wOrigin2);
std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
@@ -1350,7 +1402,7 @@ armnn::OriginsDescriptor CreateMergerDescriptorForConcatenation(
//
// Concatenation is only supported for N and C dimensions for NCHW. In case of
-// <4 dimensions we need to make sure that the concat dimensions is at least
+// <4 dimensions we need to make sure that the concat dimension is at least
// the 3rd slowest iterating one.
//
@@ -1362,8 +1414,8 @@ bool NeedPermuteForConcat(
// same number of dimensions.
unsigned int nDimensions = 0;
- // determine the number of dimensions as well as sanity check them
- // agains test implementation issues
+ // Determine the number of dimensions as well as sanity check them
+ // against test implementation issues.
for (auto && tensorInfo : inputTensorInfos)
{
if (!nDimensions)
@@ -1464,7 +1516,7 @@ void PermuteInputsForConcat(
{
numDims = tensorInfo.GetShape().GetNumDimensions();
Generate3dPermuteVectorForConcat(numDims, concatDim, permutations);
- // store the reverese permutation
+ // Store the reverse permutation.
permuteVector = permutations.second;
BOOST_ASSERT_MSG(!permuteVector.IsEqual(identity),
"Test logic error, we don't need permutation, so we shouldn't arrive here");
@@ -1499,7 +1551,7 @@ void PermuteInputsForConcat(
//
// This is the pair of PermuteInputsForConcat(...) which permutes back
-// the output of the concatenation so we can check against an expected
+// the output of the concatenation so we can check it against an expected
// output.
//
template <typename T>
@@ -1553,14 +1605,14 @@ void Concatenate(armnn::IWorkloadFactory& workloadFactory,
armnn::MergerQueueDescriptor queueDescriptor;
- // save a copy of the parameters which we might need to change
+ // Saves a copy of the parameters which we might need to change.
std::vector<armnn::TensorInfo> inputTensorInfos(inputTensorInfosOrig.begin(), inputTensorInfosOrig.end());
std::vector<T *> inputs = inputsOrig;
armnn::TensorInfo outputTensorInfo = outputTensorInfoOrig;
armnn::PermutationVector permuteVector{0, 1, 2};
- // hold and automatically release memory for the reshaped input data
+ // Holds and automatically releases memory for the reshaped input data.
std::vector<std::vector<T>> tmpInputDataStorage;
const size_t inputCount = inputTensorInfos.size();
@@ -1571,7 +1623,7 @@ void Concatenate(armnn::IWorkloadFactory& workloadFactory,
{
//
// We need to permute the inputs, because concatenation along
- // the requested axis is not supported
+ // the requested axis is not supported.
//
PermuteInputsForConcat<T>(workloadFactory,
inputTensorInfos,
@@ -2641,7 +2693,7 @@ LayerTestResult<float, 4> SimpleResizeBilinearTest(armnn::IWorkloadFactory& work
// The 'resize bilinear' operation projects the top-left corner of output texels into the input image,
// then figures out the interpolants and weights. Note this is different to projecting the centre of the
- // output texel - and thus we'll expect the output 1x1 matrix to contain as its single element the value
+ // output texel - and thus we'll expect the output 1x1 matrix to contain, as its single element, the value
// that was at position (0,0) of the input matrix (rather than an average, which we would expect if projecting
// the centre).
LayerTestResult<float, 4> result(outputTensorInfo);
@@ -3367,12 +3419,12 @@ LayerTestResult<uint8_t, 3> MergerUint8Test(armnn::IWorkloadFactory& workloadFac
unsigned int inputHeight2 = 6;
unsigned int inputChannels2 = 1;
- // Define the tensor descriptors
+ // Defines the tensor descriptors.
armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::QuantisedAsymm8);
armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::QuantisedAsymm8);
armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::QuantisedAsymm8);
- // Arbitrary scale and offsets. They don't really matter as the merger operator doesn't dequantize/quantize
+ // Arbitrary scale and offsets. They don't really matter as the merger operator doesn't dequantize/quantize them.
const float scale = 0.13497836f;
const int32_t offset = -7;
@@ -3439,10 +3491,10 @@ LayerTestResult<uint8_t, 3> MergerUint8Test(armnn::IWorkloadFactory& workloadFac
})
);
- std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; //extent of the window is defined by size of input[0]
+ std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; //Extent of the window is defined by size of input[0].
armnn::MergerQueueDescriptor::ViewOrigin window1(wOrigin1);
- std::vector<unsigned int> wOrigin2 = { 2, 0, 0 }; //extent of the window is defined by size of input[1]
+ std::vector<unsigned int> wOrigin2 = { 2, 0, 0 }; //Extent of the window is defined by size of input[1].
armnn::MergerQueueDescriptor::ViewOrigin window2(wOrigin2);
@@ -3513,21 +3565,21 @@ LayerTestResult<uint8_t, 4> AdditionUint8Test(armnn::IWorkloadFactory& workloadF
outputTensorInfo.SetQuantizationScale(scale);
outputTensorInfo.SetQuantizationOffset(offset);
- // See dequantized values to the right
+ // See dequantized values to the right.
auto input1 = MakeTensor<uint8_t, 4>(inputTensorInfo1, std::vector<uint8_t>(
{
63, 35, 77, 70, 56, 112, // 420, 224, 518, 469, 371, 763
203, 28, 252, 168, 245, 91 // 1400, 175, 1743, 1155, 1694, 616
}));
- // See dequantized values to the right
+ // See dequantized values to the right.
auto input2 = MakeTensor<uint8_t, 4>(inputTensorInfo1, std::vector<uint8_t>(
{
21, 7, 175, 231, 175, 210, // 126, 28, 1204, 1596, 1204, 1449
126, 161, 63, 21, 105, 126 // 861, 1106, 420, 126, 714, 861
}));
- // See dequantized values to the right
+ // See dequantized values to the right.
LayerTestResult<uint8_t, 4> result(outputTensorInfo);
result.outputExpected = MakeTensor<uint8_t, 4>(outputTensorInfo, std::vector<uint8_t>(
{
@@ -3633,19 +3685,19 @@ LayerTestResult<uint8_t, 4> MultiplicationUint8Test(armnn::IWorkloadFactory& wor
unsigned int width = 3;
const unsigned int shape[] = { batchSize, channels, height, width };
- // See dequantized values to the right
+ // See dequantized values to the right.
std::vector<uint8_t> input0({
62, 37, 3, 172, 13, 111, // 244, 144, 8, 684, 48, 440,
188, 20, 73, 31, 23, 31 // 748, 76, 288, 120, 88, 120
});
- // See dequantized values to the right
+ // See dequantized values to the right.
std::vector<uint8_t> input1({
126, 240, 252, 183, 121, 247, // 384, 726, 762, 555, 369, 747,
48, 115, 151, 79, 78, 97 // 150, 351, 459, 243, 240, 297
});
- // See dequantized values to the right
+ // See dequantized values to the right.
std::vector<uint8_t> output(
{
64, 72, 0, 255, 8, 236, // 93696, 104544, 6096(clamped), 379620(clamped), 17712, 328680,
@@ -3663,7 +3715,7 @@ LayerTestResult<uint8_t, 4> MultiplicationUint8Test(armnn::IWorkloadFactory& wor
-2,
shape,
output,
- 1366.255f, // Scale/offset chosen to have output values out of range
+ 1366.255f, // Scale/offset chosen to have output values out of range.
-5);
}
@@ -3813,7 +3865,7 @@ LayerTestResult<uint8_t, 4> SimpleResizeBilinearUint8Test(armnn::IWorkloadFactor
// The 'resize bilinear' operation projects the top-left corner of output texels into the input image,
// then figures out the interpolants and weights. Note this is different to projecting the centre of the
- // output texel - and thus we'll expect the output 1x1 matrix to contain as its single element the value
+ // output texel - and thus we'll expect the output 1x1 matrix to contain, as its single element, the value
// that was at position (0,0) of the input matrix (rather than an average, which we would expect if projecting
// the centre).
LayerTestResult<uint8_t, 4> result(outputTensorInfo);
@@ -4314,4 +4366,4 @@ LayerTestResult<float, 4> PermuteFloat32ValueSet2Test(armnn::IWorkloadFactory& w
LayerTestResult<float, 4> PermuteFloat32ValueSet3Test(armnn::IWorkloadFactory& workloadFactory)
{
return PermuteFloat32ValueSet3TestCommon(workloadFactory);
-};
+}; \ No newline at end of file
diff --git a/src/armnn/backends/test/LayerTests.hpp b/src/armnn/backends/test/LayerTests.hpp
index 2d543d61de..48f73e7693 100644
--- a/src/armnn/backends/test/LayerTests.hpp
+++ b/src/armnn/backends/test/LayerTests.hpp
@@ -6,12 +6,13 @@
#include "armnn/ArmNN.hpp"
#include "armnn/Tensor.hpp"
+#include "Half.hpp"
#include <boost/multi_array.hpp>
#include <boost/assert.hpp>
#include <array>
-// Layer callables
+// Layer callables.
namespace armnn
{
@@ -213,20 +214,20 @@ LayerTestResult<float, 4> CompareBoundedReLuTest(armnn::IWorkloadFactory& worklo
float upperBound,
float lowerBound);
-// Tests that the output should be identical to the input when the output dimensions match the input ones
+// Tests that the output should be identical to the input when the output dimensions match the input ones.
LayerTestResult<float, 4> ResizeBilinearNopTest(armnn::IWorkloadFactory& workloadFactory);
-// Tests the behaviour of the resize bilinear operation when rescaling a 2x2 image into a 1x1 image
+// Tests the behaviour of the resize bilinear operation when rescaling a 2x2 image into a 1x1 image.
LayerTestResult<float, 4> SimpleResizeBilinearTest(armnn::IWorkloadFactory& workloadFactory);
-// Tests resize bilinear for minification of a square input matrix (also: input dimensions are a
-// multiple of output dimensions)
+// Tests the resize bilinear operation for minification of a square input matrix (also: input dimensions are a
+// multiple of output dimensions).
LayerTestResult<float, 4> ResizeBilinearSqMinTest(armnn::IWorkloadFactory& workloadFactory);
-// Tests resize bilinear for minification (output dimensions smaller than input dimensions)
+// Tests the resize bilinear operation for minification (output dimensions smaller than input dimensions).
LayerTestResult<float, 4> ResizeBilinearMinTest(armnn::IWorkloadFactory& workloadFactory);
-// Tests resize bilinear for magnification (output dimensions bigger than input dimensions)
+// Tests the resize bilinear for magnification (output dimensions bigger than input dimensions).
LayerTestResult<float, 4> ResizeBilinearMagTest(armnn::IWorkloadFactory& workloadFactory);
LayerTestResult<float, 4> BatchNormTest(armnn::IWorkloadFactory& workloadFactory);
@@ -315,3 +316,13 @@ LayerTestResult<uint8_t, 4> SimplePermuteUint8Test(armnn::IWorkloadFactory& work
LayerTestResult<float, 4> PermuteFloat32ValueSet1Test(armnn::IWorkloadFactory& workloadFactory);
LayerTestResult<float, 4> PermuteFloat32ValueSet2Test(armnn::IWorkloadFactory& workloadFactory);
LayerTestResult<float, 4> PermuteFloat32ValueSet3Test(armnn::IWorkloadFactory& workloadFactory);
+
+LayerTestResult<float, 2> LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest
+ (armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 2>
+ LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 2>
+LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(armnn::IWorkloadFactory& workloadFactory);
+
+LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory);
diff --git a/src/armnn/backends/test/LstmTestImpl.hpp b/src/armnn/backends/test/LstmTestImpl.hpp
new file mode 100644
index 0000000000..7f67b020e2
--- /dev/null
+++ b/src/armnn/backends/test/LstmTestImpl.hpp
@@ -0,0 +1,1150 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+#pragma once
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/TypesUtils.hpp>
+
+#include "test/TensorHelpers.hpp"
+#include "QuantizeHelper.hpp"
+
+#include "backends/CpuTensorHandle.hpp"
+#include <backends/WorkloadInfo.hpp>
+#include "backends/WorkloadFactory.hpp"
+// Executes one float LSTM workload (CIFG, peephole and projection disabled) and returns actual and expected output.
+LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+ const boost::multi_array<float, 2>& input,
+ const boost::multi_array<float, 2>& outputExpected)
+{
+ unsigned int batchSize = boost::numeric_cast<unsigned int>(input.shape()[0]);
+ unsigned int inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]);
+ unsigned int outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+ // With projection disabled, the number of cell units is taken to be the output size.
+ unsigned numUnits = outputSize;
+
+ // Tensor descriptors for the three workload inputs.
+ armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
+ // Tensor descriptors for the four workload outputs; the scratch buffer is {batchSize, numUnits * 3}.
+ // NOTE(review): CIFG is disabled below, so all four gates are used - confirm that 3 * numUnits is sufficient.
+ armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+ armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+
+ // Result container: outputExpected is filled just below, output is filled after the workload has run.
+ LayerTestResult<float, 2> ret(outputTensorInfo);
+
+ std::vector<float> inputVector;
+ inputVector.assign(input.data(), input.data() + (batchSize * inputSize));
+ auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputVector);
+ // Both the incoming cell state and the incoming output state start as all zeros.
+ std::vector<float> cellStateInVector(batchSize * numUnits, 0.f);
+ auto cellStateInTensor = MakeTensor<float,2>(cellStateInTensorInfo, cellStateInVector);
+
+ std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+ auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector);
+ // Zero-filled tensors for scratch/state outputs; these three tensors are not referenced again in this function.
+ std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+ auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
+
+ std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+ auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector);
+
+ std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f);
+ auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector);
+ // Expected output, copied element-for-element from the caller-supplied array.
+ std::vector<float> outputVector;
+ outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize));
+ ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+ // Tensor handles for every input and output, created through the workload factory.
+ std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+ workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+ workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+
+ std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+ workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+ workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+
+ armnn::LstmQueueDescriptor data;
+ armnn::WorkloadInfo info;
+ // Inputs are registered in the order: input, output state, cell state.
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+ AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+ // Outputs are registered in the order: scratch buffer, output state, cell state, output.
+ AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get());
+ AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+ AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+ // Constant shapes hard-code 2 and 4 columns; the 4/8/16-value literals below assume numUnits == 4.
+ armnn::TensorInfo tensorInfo4({numUnits}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfo8({numUnits, 2}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfo16({numUnits, 4}, armnn::GetDataType<float>());
+ // Input-to-gate weights, each of shape {numUnits, 2}.
+ auto inputToInputWeights = MakeTensor<float, 2>(tensorInfo8, {-0.45018822f, -0.02338299f, -0.0870589f,
+ -0.34550029f, 0.04266912f, -0.15680569f,
+ -0.34856534f, 0.43890524f});
+
+ auto inputToForgetWeights = MakeTensor<float, 2>(tensorInfo8, {0.09701663f, 0.20334584f, -0.50592935f,
+ -0.31343272f, -0.40032279f, 0.44781327f,
+ 0.01387155f, -0.35593212f});
+
+ auto inputToCellWeights = MakeTensor<float, 2>(tensorInfo8, {-0.50013041f, 0.1370284f, 0.11810488f, 0.2013163f,
+ -0.20583314f, 0.44344562f, 0.22077113f,
+ -0.29909778f});
+
+ auto inputToOutputWeights = MakeTensor<float, 2>(tensorInfo8, {-0.25065863f, -0.28290087f, 0.04613829f,
+ 0.40525138f, 0.44272184f, 0.03897077f,
+ -0.1556896f, 0.19487578f});
+ // Recurrent-to-gate weights, each of shape {numUnits, 4}.
+ auto recurrentToInputWeights = MakeTensor<float, 2>(tensorInfo16, {-0.0063535f, -0.2042388f, 0.31454784f,
+ -0.35746509f, 0.28902304f, 0.08183324f,
+ -0.16555229f, 0.02286911f, -0.13566875f,
+ 0.03034258f, 0.48091322f, -0.12528998f,
+ 0.24077177f, -0.51332325f, -0.33502164f,
+ 0.10629296f});
+
+ auto recurrentToForgetWeights = MakeTensor<float, 2>(tensorInfo16, {-0.48684245f, -0.06655136f, 0.42224967f,
+ 0.2112639f, 0.27654213f, 0.20864892f,
+ -0.07646349f, 0.45877004f, 0.00141793f,
+ -0.14609534f, 0.36447752f, 0.09196436f,
+ 0.28053468f, 0.01560611f, -0.20127171f,
+ -0.01140004f});
+
+ auto recurrentToCellWeights = MakeTensor<float, 2>(tensorInfo16, {-0.3407414f, 0.24443203f, -0.2078532f,
+ 0.26320225f, 0.05695659f, -0.00123841f,
+ -0.4744786f, -0.35869038f, -0.06418842f,
+ -0.13502428f, -0.501764f, 0.22830659f,
+ -0.46367589f, 0.26016325f, -0.03894562f,
+ -0.16368064f});
+
+ auto recurrentToOutputWeights = MakeTensor<float, 2>(tensorInfo16, {0.43385774f, -0.17194885f, 0.2718237f,
+ 0.09215671f, 0.24107647f, -0.39835793f,
+ 0.18212086f, 0.01301402f, 0.48572797f,
+ -0.50656658f, 0.20047462f, -0.20607421f,
+ -0.51818722f, -0.15390486f, 0.0468148f,
+ 0.39922136f});
+ // Cell-to-input weights are all zero; m_PeepholeEnabled is set to false further down.
+ auto cellToInputWeights = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+ // Gate biases: all zero except the forget gate bias, which is 1 for every unit.
+ auto inputGateBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+
+ auto forgetGateBias = MakeTensor<float, 1>(tensorInfo4, {1., 1., 1., 1.});
+
+ auto cellBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+
+ auto outputGateBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+ // Handles that receive the constant data via AllocateAndCopyDataToITensorHandle below.
+ armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo8);
+ armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8);
+ armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8);
+ armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8);
+ armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
+ armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16);
+ armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16);
+ armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16);
+ armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4);
+ armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo4);
+ armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo4);
+ armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo4);
+ armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo4);
+
+ AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]);
+ AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]);
+ AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+ AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+ AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+ // Point the queue descriptor at the constant tensors (the descriptor stores raw pointers to these locals).
+ data.m_InputToInputWeights = &inputToInputWeightsTensor;
+ data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+ data.m_InputToCellWeights = &inputToCellWeightsTensor;
+ data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+ data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor;
+ data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+ data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+ data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+ data.m_CellToInputWeights = &cellToInputWeightsTensor;
+ data.m_InputGateBias = &inputGateBiasTensor;
+ data.m_ForgetGateBias = &forgetGateBiasTensor;
+ data.m_CellBias = &cellBiasTensor;
+ data.m_OutputGateBias = &outputGateBiasTensor;
+
+
+ // Test configuration: CIFG, peephole and projection are all switched off.
+ data.m_Parameters.m_ActivationFunc = 4; // NOTE(review): presumably selects tanh - confirm
+ data.m_Parameters.m_CifgEnabled = false;
+ data.m_Parameters.m_PeepholeEnabled = false;
+ data.m_Parameters.m_ProjectionEnabled = false;
+
+ // Create the workload first, then allocate every input and output handle.
+ std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+ inputHandle->Allocate();
+ outputStateInHandle->Allocate();
+ cellStateInHandle->Allocate();
+
+ scratchHandle->Allocate();
+ outputStateOutHandle->Allocate();
+ cellStateOutHandle->Allocate();
+ outputHandle->Allocate();
+ // Copy the input and the zeroed initial states into their handles.
+ CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+ CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+ CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+ workloadFactory.Finalize();
+ workload->Execute();
+ // Only the final output is read back; the scratch buffer and state outputs are not copied out or checked here.
+ CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+ return ret;
+}
+
+
+LayerTestResult<float, 2>
+LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+ const boost::multi_array<float, 2>& input,
+ const boost::multi_array<float, 2>& outputExpected) {
+
+ unsigned int batchSize = 2;
+ unsigned int outputSize = 16;
+ unsigned int inputSize = 5;
+ unsigned numUnits = 20;
+
+ armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>());
+
+ // Scratch buffer size without CIFG [batchSize, numUnits * 3]
+ armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>());
+ armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+
+ LayerTestResult<float, 2> ret(outputTensorInfo);
+
+ std::vector<float> inputVector;
+ inputVector.assign(input.data(), input.data() + (batchSize * inputSize));
+ auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputVector);
+
+ std::vector<float> cellStateInVector(batchSize * numUnits, 0.f);
+ auto cellStateInTensor = MakeTensor<float,2>(cellStateInTensorInfo, cellStateInVector);
+
+ std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+ auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector);
+
+ std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f);
+ auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
+
+ std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+ auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector);
+
+ std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f);
+ auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector);
+
+ std::vector<float> outputVector;
+ outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize));
+ ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+
+ std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+ workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+ workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+
+ std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+ workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+ workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ armnn::LstmQueueDescriptor data;
+ armnn::WorkloadInfo info;
+
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+ AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+
+ AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get());
+ AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+ AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+ armnn::TensorInfo tensorInfo16({outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfo20({numUnits}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfo20x5({numUnits, inputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfo20x16({numUnits, outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfo16x20({outputSize, numUnits}, armnn::GetDataType<float>());
+
+ auto inputToInputWeights =
+ MakeTensor<float, 2>(tensorInfo20x5, {0.021393683f,0.06124551f, 0.046905167f,-0.014657677f,-0.03149463f,
+ 0.09171803f, 0.14647801f,0.10797193f, -0.0057968358f,0.0019193048f,
+ -0.2726754f, 0.10154029f, -0.018539885f, 0.080349885f, -0.10262385f,
+ -0.022599787f,-0.09121155f, -0.008675967f, -0.045206103f,-0.0821282f,
+ -0.008045952f,0.015478081f, 0.055217247f, 0.038719587f, 0.044153627f,
+ -0.06453243f,0.05031825f, -0.046935108f, -0.008164439f, 0.014574226f,
+ -0.1671009f, -0.15519552f, -0.16819797f,-0.13971269f,-0.11953059f,
+ 0.25005487f, -0.22790983f, 0.009855087f, -0.028140958f, -0.11200698f,
+ 0.11295408f, -0.0035217577f, 0.054485075f, 0.05184695f, 0.064711206f,
+ 0.10989193f, 0.11674786f, 0.03490607f, 0.07727357f, 0.11390585f,
+ -0.1863375f, -0.1034451f, -0.13945189f, -0.049401227f, -0.18767063f,
+ 0.042483903f, 0.14233552f, 0.13832581f, 0.18350165f, 0.14545603f,
+ -0.028545704f,0.024939531f,0.050929718f,0.0076203286f,-0.0029723682f,
+ -0.042484224f, -0.11827596f, -0.09171104f, -0.10808628f,-0.16327988f,
+ -0.2273378f, -0.0993647f, -0.017155107f,0.0023917493f,0.049272764f,
+ 0.0038534778f, 0.054764505f, 0.089753784f, 0.06947234f, 0.08014476f,
+ -0.04544234f, -0.0497073f,-0.07135631f, -0.048929106f,-0.004042012f,
+ -0.009284026f, 0.018042054f, 0.0036860977f,-0.07427302f, -0.11434604f,
+ -0.018995456f, 0.031487543f, 0.012834908f,0.019977754f,0.044256654f,
+ -0.39292613f, -0.18519334f, -0.11651281f,-0.06809892f, 0.011373677f
+ });
+
+ auto inputToForgetWeights =
+ MakeTensor<float, 2>(tensorInfo20x5, {-0.0018401089f, -0.004852237f,0.03698424f, 0.014181704f,0.028273236f,
+ -0.016726194f, -0.05249759f,-0.10204261f, 0.00861066f,-0.040979505f,
+ -0.009899187f,0.01923892f,-0.028177269f, -0.08535103f,-0.14585495f,
+ 0.10662567f,-0.01909731f,-0.017883534f,-0.0047269356f,-0.045103323f,
+ 0.0030784295f,0.076784775f,0.07463696f, 0.094531395f,0.0814421f,
+ -0.12257899f, -0.033945758f,-0.031303465f, 0.045630626f,0.06843887f,
+ -0.13492945f, -0.012480007f,-0.0811829f, -0.07224499f,-0.09628791f,
+ 0.045100946f,0.0012300825f, 0.013964662f, 0.099372394f,0.02543059f,
+ 0.06958324f, 0.034257296f, 0.0482646f, 0.06267997f,0.052625068f,
+ 0.12784666f, 0.07077897f, 0.025725935f, 0.04165009f,0.07241905f,
+ 0.018668644f, -0.037377294f,-0.06277783f,-0.08833636f,-0.040120605f,
+ -0.011405586f,-0.007808335f,-0.010301386f,-0.005102167f,0.027717464f,
+ 0.05483423f, 0.11449111f, 0.11289652f,0.10939839f, 0.13396506f,
+ -0.08402166f,-0.01901462f, -0.044678304f,-0.07720565f,0.014350063f,
+ -0.11757958f, -0.0652038f, -0.08185733f,-0.076754324f,-0.092614375f,
+ 0.10405491f, 0.052960336f, 0.035755895f,0.035839386f,-0.012540553f,
+ 0.036881298f, 0.02913376f, 0.03420159f,0.05448447f,-0.054523353f,
+ 0.02582715f, 0.02327355f, -0.011857179f,-0.0011980024f,-0.034641717f,
+ -0.026125094f,-0.17582615f,-0.15923657f,-0.27486774f,-0.0006143371f,
+ 0.0001771948f, -8.470171e-05f, 0.02651807f,0.045790765f,0.06956496f
+ });
+
+ auto inputToCellWeights =
+ MakeTensor<float, 2>(tensorInfo20x5, {-0.04580283f, -0.09549462f, -0.032418985f, -0.06454633f,
+ -0.043528453f, 0.043018587f, -0.049152344f, -0.12418144f,
+ -0.078985475f, -0.07596889f, 0.019484362f, -0.11434962f,
+ -0.0074034138f, -0.06314844f, -0.092981495f, 0.0062155537f,
+ -0.025034338f, -0.0028890965f, 0.048929527f, 0.06235075f,
+ 0.10665918f, -0.032036792f, -0.08505916f, -0.10843358f,
+ -0.13002433f, -0.036816437f, -0.02130134f, -0.016518239f,
+ 0.0047691227f, -0.0025825808f, 0.066017866f, 0.029991534f,
+ -0.10652836f, -0.1037554f, -0.13056071f, -0.03266643f,
+ -0.033702414f, -0.006473424f, -0.04611692f, 0.014419339f,
+ -0.025174323f, 0.0396852f, 0.081777506f, 0.06157468f,
+ 0.10210095f, -0.009658194f, 0.046511717f, 0.03603906f,
+ 0.0069369148f, 0.015960095f, -0.06507666f, 0.09551598f,
+ 0.053568836f, 0.06408714f, 0.12835667f, -0.008714329f,
+ -0.20211966f, -0.12093674f, 0.029450472f, 0.2849013f,
+ -0.029227901f, 0.1164364f, -0.08560263f, 0.09941786f,
+ -0.036999565f, -0.028842626f, -0.0033637602f, -0.017012902f,
+ -0.09720865f, -0.11193351f, -0.029155117f, -0.017936034f,
+ -0.009768936f, -0.04223324f, -0.036159635f, 0.06505112f,
+ -0.021742892f, -0.023377212f, -0.07221364f, -0.06430552f,
+ 0.05453865f, 0.091149814f, 0.06387331f, 0.007518393f,
+ 0.055960953f, 0.069779344f, 0.046411168f, 0.10509911f,
+ 0.07463894f, 0.0075130584f, 0.012850982f, 0.04555431f,
+ 0.056955688f, 0.06555285f, 0.050801456f, -0.009862683f,
+ 0.00826772f, -0.026555609f, -0.0073611983f, -0.0014897042f
+ });
+
+ auto inputToOutputWeights =
+ MakeTensor<float, 2>(tensorInfo20x5, {-0.0998932f, -0.07201956f, -0.052803773f,-0.15629593f,-0.15001918f,
+ -0.07650751f,0.02359855f, -0.075155355f, -0.08037709f, -0.15093534f,
+ 0.029517552f, -0.04751393f, 0.010350531f,-0.02664851f, -0.016839722f,
+ -0.023121163f, 0.0077019283f, 0.012851257f, -0.05040649f,-0.0129761f,
+ -0.021737747f,-0.038305793f,-0.06870586f, -0.01481247f,-0.001285394f,
+ 0.10124236f, 0.083122835f, 0.053313006f,-0.062235646f,-0.075637154f,
+ -0.027833903f, 0.029774971f, 0.1130802f, 0.09218906f, 0.09506135f,
+ -0.086665764f,-0.037162706f,-0.038880914f,-0.035832845f,-0.014481564f,
+ -0.09825003f,-0.12048569f,-0.097665586f,-0.05287633f, -0.0964047f,
+ -0.11366429f, 0.035777505f, 0.13568819f, 0.052451383f,0.050649304f,
+ 0.05798951f, -0.021852335f,-0.099848844f,0.014740475f,-0.078897946f,
+ 0.04974699f, 0.014160473f, 0.06973932f, 0.04964942f, 0.033364646f,
+ 0.08190124f, 0.025535367f, 0.050893165f, 0.048514254f,0.06945813f,
+ -0.078907564f,-0.06707616f, -0.11844508f, -0.09986688f,-0.07509403f,
+ 0.06263226f, 0.14925587f, 0.20188436f, 0.12098451f,0.14639415f,
+ 0.0015017595f, -0.014267382f, -0.03417257f,0.012711468f,0.0028300495f,
+ -0.024758482f, -0.05098548f,-0.0821182f, 0.014225672f, 0.021544158f,
+ 0.08949725f, 0.07505268f, -0.0020780868f, 0.04908258f,0.06476295f,
+ -0.022907063f,0.027562456f,0.040185735f, 0.019567577f,-0.015598739f,
+ -0.049097303f, -0.017121866f, -0.083368234f,-0.02332002f,-0.0840956f
+ });
+
+ auto inputGateBias =
+ MakeTensor<float, 1>(tensorInfo20, {0.02234832f, 0.14757581f, 0.18176508f, 0.10380666f, 0.053110216f,
+ -0.06928846f, -0.13942584f, -0.11816189f, 0.19483899f, 0.03652339f,
+ -0.10250295f, 0.036714908f, -0.18426876f, 0.036065217f, 0.21810818f,
+ 0.02383196f, -0.043370757f, 0.08690144f, -0.04444982f, 0.00030581196f
+ });
+
+ auto forgetGateBias =
+ MakeTensor<float, 1>(tensorInfo20, {0.035185695f, -0.042891346f, -0.03032477f, 0.23027696f,
+ 0.11098921f, 0.15378423f, 0.09263801f, 0.09790885f,
+ 0.09508917f, 0.061199076f, 0.07665568f, -0.015443159f,
+ -0.03499149f, 0.046190713f, 0.08895977f, 0.10899629f,
+ 0.40694186f, 0.06030037f, 0.012413437f, -0.06108739f
+ });
+
+ auto cellBias =
+ MakeTensor<float, 1>(tensorInfo20, {-0.024379363f, 0.0055531194f, 0.23377132f, 0.033463873f,
+ -0.1483596f, -0.10639995f, -0.091433935f, 0.058573797f,
+ -0.06809782f, -0.07889636f, -0.043246906f, -0.09829136f,
+ -0.4279842f, 0.034901652f, 0.18797937f, 0.0075234566f,
+ 0.016178843f, 0.1749513f, 0.13975595f, 0.92058027f
+ });
+
+ auto outputGateBias =
+ MakeTensor<float, 1>(tensorInfo20, {0.046159424f, -0.0012809046f, 0.03563469f, 0.12648113f, 0.027195795f,
+ 0.35373217f, -0.018957434f, 0.008907322f, -0.0762701f, 0.12018895f,
+ 0.04216877f, 0.0022856654f, 0.040952638f, 0.3147856f, 0.08225149f,
+ -0.057416286f, -0.14995944f, -0.008040261f, 0.13208859f, 0.029760877f
+ });
+
+ auto recurrentToInputWeights =
+ MakeTensor<float, 2>(tensorInfo20x16, {-0.001374326f, -0.078856036f, 0.10672688f, 0.029162422f,
+ -0.11585556f, 0.02557986f, -0.13446963f, -0.035785314f,
+ -0.01244275f, 0.025961924f, -0.02337298f, -0.044228926f,
+ -0.055839065f, -0.046598054f, -0.010546039f, -0.06900766f,
+ 0.027239809f, 0.022582639f, -0.013296484f, -0.05459212f,
+ 0.08981f, -0.045407712f, 0.08682226f, -0.06867011f,
+ -0.14390695f, -0.02916037f, 0.000996957f, 0.091420636f,
+ 0.14283475f, -0.07390571f, -0.06402044f, 0.062524505f,
+ -0.093129106f, 0.04860203f, -0.08364217f, -0.08119002f,
+ 0.009352075f, 0.22920375f, 0.0016303885f, 0.11583097f,
+ -0.13732095f, 0.012405723f, -0.07551853f, 0.06343048f,
+ 0.12162708f, -0.031923793f, -0.014335606f, 0.01790974f,
+ -0.10650317f, -0.0724401f, 0.08554849f, -0.05727212f,
+ 0.06556731f, -0.042729504f, -0.043227166f, 0.011683251f,
+ -0.013082158f, -0.029302018f, -0.010899579f, -0.062036745f,
+ -0.022509435f, -0.00964907f, -0.01567329f, 0.04260106f,
+ -0.07787477f, -0.11576462f, 0.017356863f, 0.048673786f,
+ -0.017577527f, -0.05527947f, -0.082487635f, -0.040137455f,
+ -0.10820036f, -0.04666372f, 0.022746278f, -0.07851417f,
+ 0.01068115f, 0.032956902f, 0.022433773f, 0.0026891115f,
+ 0.08944216f, -0.0685835f, 0.010513544f, 0.07228705f,
+ 0.02032331f, -0.059686817f, -0.0005566496f, -0.086984694f,
+ 0.040414046f, -0.1380399f, 0.094208956f, -0.05722982f,
+ 0.012092817f, -0.04989123f, -0.086576f, -0.003399834f,
+ -0.04696032f, -0.045747425f, 0.10091314f, 0.048676282f,
+ -0.029037097f, 0.031399418f, -0.0040285117f, 0.047237843f,
+ 0.09504992f, 0.041799378f, -0.049185462f, -0.031518843f,
+ -0.10516937f, 0.026374253f, 0.10058866f, -0.0033195973f,
+ -0.041975245f, 0.0073591834f, 0.0033782164f, -0.004325073f,
+ -0.10167381f, 0.042500053f, -0.01447153f, 0.06464186f,
+ -0.017142897f, 0.03312627f, 0.009205989f, 0.024138335f,
+ -0.011337001f, 0.035530265f, -0.010912711f, 0.0706555f,
+ -0.005894094f, 0.051841937f, -0.1401738f, -0.02351249f,
+ 0.0365468f, 0.07590991f, 0.08838724f, 0.021681072f,
+ -0.10086113f, 0.019608743f, -0.06195883f, 0.077335775f,
+ 0.023646897f, -0.095322326f, 0.02233014f, 0.09756986f,
+ -0.048691444f, -0.009579111f, 0.07595467f, 0.11480546f,
+ -0.09801813f, 0.019894179f, 0.08502348f, 0.004032281f,
+ 0.037211012f, 0.068537936f, -0.048005626f, -0.091520436f,
+ -0.028379958f, -0.01556313f, 0.06554592f, -0.045599163f,
+ -0.01672207f, -0.020169014f, -0.011877351f, -0.20212261f,
+ 0.010889619f, 0.0047078193f, 0.038385306f, 0.08540671f,
+ -0.017140968f, -0.0035865551f, 0.016678626f, 0.005633034f,
+ 0.015963363f, 0.00871737f, 0.060130805f, 0.028611384f,
+ 0.10109069f, -0.015060172f, -0.07894427f, 0.06401885f,
+ 0.011584063f, -0.024466386f, 0.0047652307f, -0.09041358f,
+ 0.030737216f, -0.0046374933f, 0.14215417f, -0.11823516f,
+ 0.019899689f, 0.006106124f, -0.027092824f, 0.0786356f,
+ 0.05052217f, -0.058925f, -0.011402121f, -0.024987547f,
+ -0.0013661642f, -0.06832946f, -0.015667673f, -0.1083353f,
+ -0.00096863037f, -0.06988685f, -0.053350925f, -0.027275559f,
+ -0.033664223f, -0.07978348f, -0.025200296f, -0.017207067f,
+ -0.058403496f, -0.055697463f, 0.005798788f, 0.12965427f,
+ -0.062582195f, 0.0013350133f, -0.10482091f, 0.0379771f,
+ 0.072521195f, -0.0029455067f, -0.13797039f, -0.03628521f,
+ 0.013806405f, -0.017858358f, -0.01008298f, -0.07700066f,
+ -0.017081132f, 0.019358726f, 0.0027079724f, 0.004635139f,
+ 0.062634714f, -0.02338735f, -0.039547626f, -0.02050681f,
+ 0.03385117f, -0.083611414f, 0.002862572f, -0.09421313f,
+ 0.058618143f, -0.08598433f, 0.00972939f, 0.023867095f,
+ -0.053934585f, -0.023203006f, 0.07452513f, -0.048767887f,
+ -0.07314807f, -0.056307215f, -0.10433547f, -0.06440842f,
+ 0.04328182f, 0.04389765f, -0.020006588f, -0.09076438f,
+ -0.11652589f, -0.021705797f, 0.03345259f, -0.010329105f,
+ -0.025767034f, 0.013057034f, -0.07316461f, -0.10145612f,
+ 0.06358255f, 0.18531723f, 0.07759293f, 0.12006465f,
+ 0.1305557f, 0.058638252f, -0.03393652f, 0.09622831f,
+ -0.16253184f, -2.4580743e-06f, 0.079869635f, -0.070196845f,
+ -0.005644518f, 0.06857898f, -0.12598175f, -0.035084512f,
+ 0.03156317f, -0.12794146f, -0.031963028f, 0.04692781f,
+ 0.030070418f, 0.0071660685f, -0.095516115f, -0.004643372f,
+ 0.040170413f, -0.062104587f, -0.0037324072f, 0.0554317f,
+ 0.08184801f, -0.019164372f, 0.06791302f, 0.034257166f,
+ -0.10307039f, 0.021943003f, 0.046745934f, 0.0790918f,
+ -0.0265588f, -0.007824208f, 0.042546265f, -0.00977924f,
+ -0.0002440307f, -0.017384544f, -0.017990116f, 0.12252321f,
+ -0.014512694f, -0.08251313f, 0.08861942f, 0.13589665f,
+ 0.026351685f, 0.012641483f, 0.07466548f, 0.044301085f,
+ -0.045414884f, -0.051112458f, 0.03444247f, -0.08502782f,
+ -0.04106223f, -0.028126027f, 0.028473156f, 0.10467447f
+ });
+
+ auto recurrentToForgetWeights =
+ MakeTensor<float, 2>(tensorInfo20x16, {-0.057784554f, -0.026057621f, -0.068447545f, -0.022581743f,
+ 0.14811787f, 0.10826372f, 0.09471067f, 0.03987225f,
+ -0.0039523416f, 0.00030638507f, 0.053185795f, 0.10572994f,
+ 0.08414449f, -0.022036452f, -0.00066928595f, -0.09203576f,
+ 0.032950465f, -0.10985798f, -0.023809856f, 0.0021431844f,
+ -0.02196096f, -0.00326074f, 0.00058621005f, -0.074678116f,
+ -0.06193199f, 0.055729095f, 0.03736828f, 0.020123724f,
+ 0.061878487f, -0.04729229f, 0.034919553f, -0.07585433f,
+ -0.04421272f, -0.044019096f, 0.085488975f, 0.04058006f,
+ -0.06890133f, -0.030951202f, -0.024628663f, -0.07672815f,
+ 0.034293607f, 0.08556707f, -0.05293577f, -0.033561368f,
+ -0.04899627f, 0.0241671f, 0.015736353f, -0.095442444f,
+ -0.029564252f, 0.016493602f, -0.035026584f, 0.022337519f,
+ -0.026871363f, 0.004780428f, 0.0077918363f, -0.03601621f,
+ 0.016435321f, -0.03263031f, -0.09543275f, -0.047392778f,
+ 0.013454138f, 0.028934088f, 0.01685226f, -0.086110644f,
+ -0.046250615f, -0.01847454f, 0.047608484f, 0.07339695f,
+ 0.034546845f, -0.04881143f, 0.009128804f, -0.08802852f,
+ 0.03761666f, 0.008096139f, -0.014454086f, 0.014361001f,
+ -0.023502491f, -0.0011840804f, -0.07607001f, 0.001856849f,
+ -0.06509276f, -0.006021153f, -0.08570962f, -0.1451793f,
+ 0.060212336f, 0.055259194f, 0.06974018f, 0.049454916f,
+ -0.027794661f, -0.08077226f, -0.016179763f, 0.1169753f,
+ 0.17213494f, -0.0056326236f, -0.053934924f, -0.0124349f,
+ -0.11520337f, 0.05409887f, 0.088759385f, 0.0019655675f,
+ 0.0042065294f, 0.03881498f, 0.019844765f, 0.041858196f,
+ -0.05695512f, 0.047233116f, 0.038937137f, -0.06542224f,
+ 0.014429736f, -0.09719407f, 0.13908425f, -0.05379757f,
+ 0.012321099f, 0.082840554f, -0.029899208f, 0.044217527f,
+ 0.059855383f, 0.07711018f, -0.045319796f, 0.0948846f,
+ -0.011724666f, -0.0033288454f, -0.033542685f, -0.04764985f,
+ -0.13873616f, 0.040668588f, 0.034832682f, -0.015319203f,
+ -0.018715994f, 0.046002675f, 0.0599172f, -0.043107376f,
+ 0.0294216f, -0.002314414f, -0.022424703f, 0.0030315618f,
+ 0.0014641669f, 0.0029166266f, -0.11878115f, 0.013738511f,
+ 0.12375372f, -0.0006038222f, 0.029104086f, 0.087442465f,
+ 0.052958444f, 0.07558703f, 0.04817258f, 0.044462286f,
+ -0.015213451f, -0.08783778f, -0.0561384f, -0.003008196f,
+ 0.047060397f, -0.002058388f, 0.03429439f, -0.018839769f,
+ 0.024734668f, 0.024614193f, -0.042046934f, 0.09597743f,
+ -0.0043254104f, 0.04320769f, 0.0064070094f, -0.0019131786f,
+ -0.02558259f, -0.022822596f, -0.023273505f, -0.02464396f,
+ -0.10991725f, -0.006240552f, 0.0074488563f, 0.024044557f,
+ 0.04383914f, -0.046476185f, 0.028658995f, 0.060410924f,
+ 0.050786525f, 0.009452605f, -0.0073054377f, -0.024810238f,
+ 0.0052906186f, 0.0066939713f, -0.0020913032f, 0.014515517f,
+ 0.015898481f, 0.021362653f, -0.030262267f, 0.016587038f,
+ -0.011442813f, 0.041154444f, -0.007631438f, -0.03423484f,
+ -0.010977775f, 0.036152758f, 0.0066366293f, 0.11915515f,
+ 0.02318443f, -0.041350313f, 0.021485701f, -0.10906167f,
+ -0.028218046f, -0.00954771f, 0.020531068f, -0.11995105f,
+ -0.03672871f, 0.024019798f, 0.014255957f, -0.05221243f,
+ -0.00661567f, -0.04630967f, 0.033188973f, 0.10107534f,
+ -0.014027541f, 0.030796422f, -0.10270911f, -0.035999842f,
+ 0.15443139f, 0.07684145f, 0.036571592f, -0.035900835f,
+ -0.0034699554f, 0.06209149f, 0.015920248f, -0.031122351f,
+ -0.03858649f, 0.01849943f, 0.13872518f, 0.01503974f,
+ 0.069941424f, -0.06948533f, -0.0088794185f, 0.061282158f,
+ -0.047401894f, 0.03100163f, -0.041533746f, -0.10430945f,
+ 0.044574402f, -0.01425562f, -0.024290353f, 0.034563623f,
+ 0.05866852f, 0.023947537f, -0.09445152f, 0.035450947f,
+ 0.02247216f, -0.0042998926f, 0.061146557f, -0.10250651f,
+ 0.020881841f, -0.06747029f, 0.10062043f, -0.0023941975f,
+ 0.03532124f, -0.016341697f, 0.09685456f, -0.016764693f,
+ 0.051808182f, 0.05875331f, -0.04536488f, 0.001626336f,
+ -0.028892258f, -0.01048663f, -0.009793449f, -0.017093895f,
+ 0.010987891f, 0.02357273f, -0.00010856845f, 0.0099760275f,
+ -0.001845119f, -0.03551521f, 0.0018358806f, 0.05763657f,
+ -0.01769146f, 0.040995963f, 0.02235177f, -0.060430344f,
+ 0.11475477f, -0.023854522f, 0.10071741f, 0.0686208f,
+ -0.014250481f, 0.034261297f, 0.047418304f, 0.08562733f,
+ -0.030519066f, 0.0060542435f, 0.014653856f, -0.038836084f,
+ 0.04096551f, 0.032249358f, -0.08355519f, -0.026823482f,
+ 0.056386515f, -0.010401743f, -0.028396193f, 0.08507674f,
+ 0.014410365f, 0.020995233f, 0.17040324f, 0.11511526f,
+ 0.02459721f, 0.0066619175f, 0.025853224f, -0.023133837f,
+ -0.081302024f, 0.017264642f, -0.009585969f, 0.09491168f,
+ -0.051313367f, 0.054532815f, -0.014298593f, 0.10657464f,
+ 0.007076659f, 0.10964551f, 0.0409152f, 0.008275321f,
+ -0.07283536f, 0.07937492f, 0.04192024f, -0.1075027f
+ });
+
+ auto recurrentToCellWeights =
+ MakeTensor<float, 2>(tensorInfo20x16, {-0.037322544f, 0.018592842f, 0.0056175636f, -0.06253426f,
+ 0.055647098f, -0.05713207f, -0.05626563f, 0.005559383f,
+ 0.03375411f, -0.025757805f, -0.088049285f, 0.06017052f,
+ -0.06570978f, 0.007384076f, 0.035123326f, -0.07920549f,
+ 0.053676967f, 0.044480428f, -0.07663568f, 0.0071805613f,
+ 0.08089997f, 0.05143358f, 0.038261272f, 0.03339287f,
+ -0.027673481f, 0.044746667f, 0.028349208f, 0.020090483f,
+ -0.019443132f, -0.030755889f, -0.0040000007f, 0.04465846f,
+ -0.021585021f, 0.0031670958f, 0.0053199246f, -0.056117613f,
+ -0.10893326f, 0.076739706f, -0.08509834f, -0.027997585f,
+ 0.037871376f, 0.01449768f, -0.09002357f, -0.06111149f,
+ -0.046195522f, 0.0422062f, -0.005683705f, -0.1253618f,
+ -0.012925729f, -0.04890792f, 0.06985068f, 0.037654128f,
+ 0.03398274f, -0.004781977f, 0.007032333f, -0.031787455f,
+ 0.010868644f, -0.031489216f, 0.09525667f, 0.013939797f,
+ 0.0058680447f, 0.0167067f, 0.02668468f, -0.04797466f,
+ -0.048885044f, -0.12722108f, 0.035304096f, 0.06554885f,
+ 0.00972396f, -0.039238118f, -0.05159735f, -0.11329045f,
+ 0.1613692f, -0.03750952f, 0.06529313f, -0.071974665f,
+ -0.11769596f, 0.015524369f, -0.0013754242f, -0.12446318f,
+ 0.02786344f, -0.014179351f, 0.005264273f, 0.14376344f,
+ 0.015983658f, 0.03406988f, -0.06939408f, 0.040699873f,
+ 0.02111075f, 0.09669095f, 0.041345075f, -0.08316494f,
+ -0.07684199f, -0.045768797f, 0.032298047f, -0.041805092f,
+ 0.0119405f, 0.0061010392f, 0.12652606f, 0.0064572375f,
+ -0.024950314f, 0.11574242f, 0.04508852f, -0.04335324f,
+ 0.06760663f, -0.027437469f, 0.07216407f, 0.06977076f,
+ -0.05438599f, 0.034033038f, -0.028602652f, 0.05346137f,
+ 0.043184172f, -0.037189785f, 0.10420091f, 0.00882477f,
+ -0.054019816f, -0.074273005f, -0.030617684f, -0.0028467078f,
+ 0.024302477f, -0.0038869337f, 0.005332455f, 0.0013399826f,
+ 0.04361412f, -0.007001822f, 0.09631092f, -0.06702025f,
+ -0.042049985f, -0.035070654f, -0.04103342f, -0.10273396f,
+ 0.0544271f, 0.037184782f, -0.13150354f, -0.0058036847f,
+ -0.008264958f, 0.042035464f, 0.05891794f, 0.029673764f,
+ 0.0063542654f, 0.044788733f, 0.054816857f, 0.062257513f,
+ -0.00093483756f, 0.048938446f, -0.004952862f, -0.007730018f,
+ -0.04043371f, -0.017094059f, 0.07229206f, -0.023670016f,
+ -0.052195564f, -0.025616996f, -0.01520939f, 0.045104615f,
+ -0.007376126f, 0.003533447f, 0.006570588f, 0.056037236f,
+ 0.12436656f, 0.051817212f, 0.028532185f, -0.08686856f,
+ 0.11868599f, 0.07663395f, -0.07323171f, 0.03463402f,
+ -0.050708205f, -0.04458982f, -0.11590894f, 0.021273347f,
+ 0.1251325f, -0.15313013f, -0.12224372f, 0.17228661f,
+ 0.023029093f, 0.086124025f, 0.006445803f, -0.03496501f,
+ 0.028332196f, 0.04449512f, -0.042436164f, -0.026587414f,
+ -0.006041347f, -0.09292539f, -0.05678812f, 0.03897832f,
+ 0.09465633f, 0.008115513f, -0.02171956f, 0.08304309f,
+ 0.071401566f, 0.019622514f, 0.032163795f, -0.004167056f,
+ 0.02295182f, 0.030739572f, 0.056506045f, 0.004612461f,
+ 0.06524936f, 0.059999723f, 0.046395954f, -0.0045512207f,
+ -0.1335546f, -0.030136576f, 0.11584653f, -0.014678886f,
+ 0.0020118146f, -0.09688814f, -0.0790206f, 0.039770417f,
+ -0.0329582f, 0.07922767f, 0.029322514f, 0.026405897f,
+ 0.04207835f, -0.07073373f, 0.063781224f, 0.0859677f,
+ -0.10925287f, -0.07011058f, 0.048005477f, 0.03438226f,
+ -0.09606514f, -0.006669445f, -0.043381985f, 0.04240257f,
+ -0.06955775f, -0.06769346f, 0.043903265f, -0.026784198f,
+ -0.017840602f, 0.024307009f, -0.040079936f, -0.019946516f,
+ 0.045318738f, -0.12233574f, 0.026170589f, 0.0074471775f,
+ 0.15978073f, 0.10185836f, 0.10298046f, -0.015476589f,
+ -0.039390966f, -0.072174534f, 0.0739445f, -0.1211869f,
+ -0.0347889f, -0.07943156f, 0.014809798f, -0.12412325f,
+ -0.0030663363f, 0.039695457f, 0.0647603f, -0.08291318f,
+ -0.018529687f, -0.004423833f, 0.0037507233f, 0.084633216f,
+ -0.01514876f, -0.056505352f, -0.012800942f, -0.06994386f,
+ 0.012962922f, -0.031234352f, 0.07029052f, 0.016418684f,
+ 0.03618972f, 0.055686004f, -0.08663945f, -0.017404709f,
+ -0.054761406f, 0.029065743f, 0.052404847f, 0.020238016f,
+ 0.0048197987f, -0.0214882f, 0.07078733f, 0.013016777f,
+ 0.06262858f, 0.009184685f, 0.020785125f, -0.043904778f,
+ -0.0270329f, -0.03299152f, -0.060088247f, -0.015162964f,
+ -0.001828936f, 0.12642565f, -0.056757294f, 0.013586685f,
+ 0.09232601f, -0.035886683f, 0.06000002f, 0.05229691f,
+ -0.052580316f, -0.082029596f, -0.010794592f, 0.012947712f,
+ -0.036429964f, -0.085508935f, -0.13127148f, -0.017744139f,
+ 0.031502828f, 0.036232427f, -0.031581745f, 0.023051167f,
+ -0.05325106f, -0.03421577f, 0.028793324f, -0.034633752f,
+ -0.009881397f, -0.043551125f, -0.018609839f, 0.0019097115f,
+ -0.008799762f, 0.056595087f, 0.0022273948f, 0.055752404f
+ });
+
+ auto recurrentToOutputWeights =
+ MakeTensor<float, 2>(tensorInfo20x16, {0.025825322f, -0.05813119f, 0.09495884f,-0.045984812f, -0.01255415f,
+ -0.0026479573f,-0.08196161f,-0.054914974f,-0.0046604523f,
+ -0.029587349f, -0.044576716f, -0.07480124f, -0.082868785f,
+ 0.023254942f, 0.027502948f, -0.0039728214f, -0.08683098f,
+ -0.08116779f, -0.014675607f, -0.037924774f, -0.023314456f,
+ -0.007401714f, -0.09255757f, 0.029460307f, -0.08829125f,
+ -0.005139627f, -0.08989442f, -0.0555066f, 0.13596267f,
+ -0.025062224f, -0.048351806f, -0.03850004f, 0.07266485f,
+ -0.022414139f, 0.05940088f, 0.075114764f, 0.09597592f,
+ -0.010211725f, -0.0049794707f, -0.011523867f, -0.025980417f,
+ 0.072999895f, 0.11091378f, -0.081685916f, 0.014416728f,
+ 0.043229222f, 0.034178585f, -0.07530371f, 0.035837382f,
+ -0.085607f, -0.007721233f, -0.03287832f, -0.043848954f,
+ -0.06404588f, -0.06632928f, -0.073643476f, 0.008214239f,
+ -0.045984086f, 0.039764922f, 0.03474462f, 0.060612556f,
+ -0.080590084f, 0.049127717f, 0.04151091f, -0.030063879f,
+ 0.008801774f, -0.023021035f, -0.019558564f, 0.05158114f,
+ -0.010947698f, -0.011825728f, 0.0075720972f, 0.0699727f,
+ -0.0039981045f, 0.069350146f, 0.08799282f, 0.016156472f,
+ 0.035502106f, 0.11695009f, 0.006217345f, 0.13392477f,
+ -0.037875112f, 0.025745004f, 0.08940699f, -0.00924166f,
+ 0.0046702605f, -0.036598757f, -0.08811812f, 0.10522024f,
+ -0.032441203f, 0.008176899f, -0.04454919f, 0.07058152f,
+ 0.0067963637f, 0.039206743f, 0.03259838f, 0.03725492f,
+ -0.09515802f, 0.013326398f, -0.052055415f, -0.025676316f,
+ 0.03198509f, -0.015951829f, -0.058556724f, 0.036879618f,
+ 0.043357447f, 0.028362012f, -0.05908629f, 0.0059240665f,
+ -0.04995891f, -0.019187413f,0.0276265f, -0.01628143f, 0.0025863599f,
+ 0.08800015f, 0.035250366f, -0.022165963f, -0.07328642f,
+ -0.009415526f, -0.07455109f, 0.11690406f, 0.0363299f,
+ 0.07411125f, 0.042103454f, -0.009660886f, 0.019076364f,
+ 0.018299393f, -0.046004917f, 0.08891175f,0.0431396f, -0.026327137f,
+ -0.051502608f, 0.08979574f, -0.051670972f, 0.04940282f,
+ -0.07491107f, -0.021240504f, 0.022596184f, -0.034280192f,
+ 0.060163025f, -0.058211457f, -0.051837247f, -0.01349775f,
+ -0.04639988f, -0.035936575f, -0.011681591f, 0.064818054f,
+ 0.0073146066f, -0.021745546f, -0.043124277f, -0.06471268f,
+ -0.07053354f, -0.029321948f, -0.05330136f, 0.016933719f,
+ -0.053782392f, 0.13747959f, -0.1361751f, -0.11569455f,
+ 0.0033329215f, 0.05693899f, -0.053219706f, 0.063698f,
+ 0.07977434f, -0.07924483f, 0.06936997f, 0.0034815092f,
+ -0.007305279f, -0.037325785f, -0.07251102f, -0.033633437f,
+ -0.08677009f, 0.091591336f, -0.14165086f, 0.021752775f,
+ 0.019683983f, 0.0011612234f, -0.058154266f, 0.049996935f,
+ 0.0288841f, -0.0024567875f, -0.14345716f, 0.010955264f,-0.10234828f,
+ 0.1183656f, -0.0010731248f, -0.023590032f,-0.072285876f,-0.0724771f,
+ -0.026382286f, -0.0014920527f, 0.042667855f, 0.0018776858f,
+ 0.02986552f, 0.009814309f, 0.0733756f, 0.12289186f,
+ 0.018043943f, -0.0458958f, 0.049412545f, 0.033632483f,
+ 0.05495232f, 0.036686596f, -0.013781798f, -0.010036754f,
+ 0.02576849f, -0.08307328f, 0.010112348f, 0.042521734f,
+ -0.05869831f, -0.071689695f, 0.03876447f, -0.13275425f, -0.0352966f,
+ -0.023077697f, 0.10285965f, 0.084736146f, 0.15568255f,
+ -0.00040734606f, 0.027835453f, -0.10292561f, -0.032401145f,
+ 0.10053256f, -0.026142767f, -0.08271222f, -0.0030240538f,
+ -0.016368777f, 0.1070414f, 0.042672627f, 0.013456989f,
+ -0.0437609f, -0.022309763f, 0.11576483f, 0.04108048f,
+ 0.061026827f, -0.0190714f, -0.0869359f, 0.037901703f, 0.0610107f,
+ 0.07202949f, 0.01675338f, 0.086139716f, -0.08795751f,
+ -0.014898893f, -0.023771819f, -0.01965048f, 0.007955471f,
+ -0.043740474f, 0.03346837f, -0.10549954f, 0.090567775f,
+ 0.042013682f, -0.03176985f, 0.12569028f, -0.02421228f,
+ -0.029526481f, 0.023851605f, 0.031539805f, 0.05292009f,
+ -0.02344001f, -0.07811758f, -0.08834428f, 0.10094801f,
+ 0.16594367f, -0.06861939f, -0.021256343f, -0.041093912f,
+ -0.06669611f, 0.035498552f, 0.021757556f, -0.09302526f,
+ -0.015403468f, -0.06614931f, -0.051798206f, -0.013874718f,
+ 0.03630673f, 0.010412845f, -0.08077351f, 0.046185967f,
+ 0.0035662893f, 0.03541868f, -0.094149634f, -0.034814864f,
+ 0.003128424f, -0.020674974f, -0.03944324f, -0.008110165f,
+ -0.11113267f, 0.08484226f, 0.043586485f, 0.040582247f,
+ 0.0968012f, -0.065249965f, -0.028036479f, 0.0050708856f,
+ 0.0017462453f, 0.0326779f, 0.041296225f, 0.09164146f,
+ -0.047743853f, -0.015952192f, -0.034451712f, 0.084197424f,
+ -0.05347844f, -0.11768019f, 0.085926116f, -0.08251791f,
+ -0.045081906f, 0.0948852f, 0.068401024f, 0.024856757f,
+ 0.06978981f, -0.057309967f, -0.012775832f, -0.0032452994f,
+ 0.01977615f, -0.041040014f, -0.024264973f,0.063464895f, 0.05431621f
+ });
+
+ auto cellToInputWeights =
+ MakeTensor<float, 1>(tensorInfo20, {0.040369894f, 0.030746894f, 0.24704495f, 0.018586371f, -0.037586458f,
+ -0.15312155f, -0.11812848f, -0.11465643f, 0.20259799f, 0.11418174f,
+ -0.10116027f, -0.011334949f, 0.12411352f, -0.076769054f,-0.052169047f,
+ 0.21198851f, -0.38871562f, -0.09061183f, -0.09683246f, -0.21929175f
+ });
+
+
+ auto cellToForgetWeights =
+ MakeTensor<float, 1>(tensorInfo20, {-0.01998659f,-0.15568835f,-0.24248174f, -0.012770197f, 0.041331276f,
+ -0.072311886f, -0.052123554f,-0.0066330447f,-0.043891653f,0.036225766f,
+ -0.047248036f, 0.021479502f,0.033189066f, 0.11952997f, -0.020432774f,
+ 0.64658105f, -0.06650122f, -0.03467612f, 0.095340036f, 0.23647355f
+ });
+
+ auto cellToOutputWeights =
+ MakeTensor<float, 1>(tensorInfo20, {0.08286371f, -0.08261836f, -0.51210177f, 0.002913762f, 0.17764764f,
+ -0.5495371f, -0.08460716f, -0.24552552f, 0.030037103f, 0.04123544f,
+ -0.11940523f, 0.007358328f, 0.1890978f, 0.4833202f, -0.34441817f,
+ 0.36312827f, -0.26375428f, 0.1457655f, -0.19724406f, 0.15548733f
+ });
+
+ auto projectionWeights =
+ MakeTensor<float, 2>(tensorInfo16x20,
+ {-0.009802181f, 0.09401916f, 0.0717386f, -0.13895074f, 0.09641832f,
+ 0.060420845f, 0.08539281f, 0.054285463f, 0.061395317f, 0.034448683f,
+ -0.042991187f, 0.019801661f, -0.16840284f, -0.015726732f, -0.23041931f,
+ -0.024478018f, -0.10959692f, -0.013875541f, 0.18600968f, -0.061274476f,
+ 0.0138165f, -0.08160894f, -0.07661644f, 0.032372914f, 0.16169067f,
+ 0.22465782f, -0.03993472f, -0.004017731f, 0.08633481f, -0.28869787f,
+ 0.08682067f, 0.17240396f, 0.014975425f, 0.056431185f, 0.031037588f,
+ 0.16702051f, 0.0077946745f, 0.15140012f, 0.29405436f, 0.120285f,
+ -0.188994f, -0.027265169f, 0.043389652f, -0.022061434f, 0.014777949f,
+ -0.20203483f, 0.094781205f, 0.19100232f, 0.13987629f, -0.036132768f,
+ -0.06426278f, -0.05108664f, 0.13221376f, 0.009441198f, -0.16715929f,
+ 0.15859416f, -0.040437475f, 0.050779544f, -0.022187516f, 0.012166504f,
+ 0.027685808f, -0.07675938f, -0.0055694645f, -0.09444123f, 0.0046453946f,
+ 0.050794356f, 0.10770313f, -0.20790008f, -0.07149004f, -0.11425117f,
+ 0.008225835f, -0.035802525f, 0.14374903f, 0.15262283f, 0.048710253f,
+ 0.1847461f, -0.007487823f, 0.11000021f, -0.09542012f, 0.22619456f,
+ -0.029149994f, 0.08527916f, 0.009043713f, 0.0042746216f, 0.016261552f,
+ 0.022461696f, 0.12689082f, -0.043589946f, -0.12035478f, -0.08361797f,
+ -0.050666027f, -0.1248618f, -0.1275799f, -0.071875185f, 0.07377272f,
+ 0.09944291f, -0.18897448f, -0.1593054f, -0.06526116f, -0.040107165f,
+ -0.004618631f, -0.067624845f, -0.007576253f, 0.10727444f, 0.041546922f,
+ -0.20424393f, 0.06907816f, 0.050412357f, 0.00724631f, 0.039827548f,
+ 0.12449835f, 0.10747581f, 0.13708383f, 0.09134148f, -0.12617786f,
+ -0.06428341f, 0.09956831f, 0.1208086f, -0.14676677f, -0.0727722f,
+ 0.1126304f, 0.010139365f, 0.015571211f, -0.038128063f, 0.022913318f,
+ -0.042050496f, 0.16842307f, -0.060597885f, 0.10531834f, -0.06411776f,
+ -0.07451711f, -0.03410368f, -0.13393489f, 0.06534304f, 0.003620307f,
+ 0.04490757f, 0.05970546f, 0.05197996f, 0.02839995f, 0.10434969f,
+ -0.013699693f, -0.028353551f, -0.07260381f, 0.047201227f, -0.024575593f,
+ -0.036445823f, 0.07155557f, 0.009672501f, -0.02328883f, 0.009533515f,
+ -0.03606021f, -0.07421458f, -0.028082801f, -0.2678904f, -0.13221288f,
+ 0.18419984f, -0.13012612f, -0.014588381f, -0.035059117f, -0.04824723f,
+ 0.07830115f, -0.056184657f, 0.03277091f, 0.025466874f, 0.14494097f,
+ -0.12522776f, -0.098633975f, -0.10766018f, -0.08317623f, 0.08594209f,
+ 0.07749552f, 0.039474737f, 0.1776665f, -0.07409566f, -0.0477268f,
+ 0.29323658f, 0.10801441f, 0.1154011f, 0.013952499f, 0.10739139f,
+ 0.10708251f, -0.051456142f, 0.0074137426f, -0.10430189f, 0.10034707f,
+ 0.045594677f, 0.0635285f, -0.0715442f, -0.089667566f, -0.10811871f,
+ 0.00026344223f, 0.08298446f, -0.009525053f, 0.006585689f, -0.24567553f,
+ -0.09450807f, 0.09648481f, 0.026996298f, -0.06419476f, -0.04752702f,
+ -0.11063944f, -0.23441927f, -0.17608605f, -0.052156363f, 0.067035615f,
+ 0.19271925f, -0.0032889997f, -0.043264326f, 0.09663576f, -0.057112187f,
+ -0.10100678f, 0.0628376f, 0.04447668f, 0.017961001f, -0.10094388f,
+ -0.10190601f, 0.18335468f, 0.10494553f, -0.052095775f, -0.0026118709f,
+ 0.10539724f, -0.04383912f, -0.042349473f, 0.08438151f, -0.1947263f,
+ 0.02251204f, 0.11216432f, -0.10307853f, 0.17351969f, -0.039091777f,
+ 0.08066188f, -0.00561982f, 0.12633002f, 0.11335965f, -0.0088127935f,
+ -0.019777594f, 0.06864014f, -0.059751723f, 0.016233567f, -0.06894641f,
+ -0.28651384f, -0.004228674f, 0.019708522f, -0.16305895f, -0.07468996f,
+ -0.0855457f, 0.099339016f, -0.07580735f, -0.13775392f, 0.08434318f,
+ 0.08330512f, -0.12131499f, 0.031935584f, 0.09180414f, -0.08876437f,
+ -0.08049874f, 0.008753825f, 0.03498998f, 0.030215185f, 0.03907079f,
+ 0.089751154f, 0.029194152f, -0.03337423f, -0.019092513f, 0.04331237f,
+ 0.04299654f, -0.036394123f, -0.12915532f, 0.09793732f, 0.07512415f,
+ -0.11319543f, -0.032502122f, 0.15661901f, 0.07671967f, -0.005491124f,
+ -0.19379048f, -0.218606f, 0.21448623f, 0.017840758f, 0.1416943f,
+ -0.07051762f, 0.19488361f, 0.02664691f, -0.18104725f, -0.09334311f,
+ 0.15026465f, -0.15493552f, -0.057762887f, -0.11604192f, -0.262013f,
+ -0.01391798f, 0.012185008f, 0.11156489f, -0.07483202f, 0.06693364f,
+ -0.26151478f, 0.046425626f, 0.036540434f, -0.16435726f, 0.17338543f,
+ -0.21401681f, -0.11385144f, -0.08283257f, -0.069031075f, 0.030635102f,
+ 0.010969227f, 0.11109743f, 0.010919218f, 0.027526086f, 0.13519906f,
+ 0.01891392f, -0.046839405f, -0.040167913f, 0.017953383f, -0.09700955f,
+ 0.0061885654f, -0.07000971f, 0.026893595f, -0.038844477f, 0.14543656f
+ });
+
+ std::vector<float> projectionBiasVector(outputSize, 0.f);
+ auto projectionBias = MakeTensor<float,1>(tensorInfo16, projectionBiasVector);
+
+ armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo20x5);
+ armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo20x5);
+ armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo20x5);
+ armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo20x5);
+ armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo20x16);
+ armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo20x16);
+ armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo20x16);
+ armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo20x16);
+ armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfo20);
+ armnn::ScopedCpuTensorHandle projectionWeightsTensor(tensorInfo16x20);
+ armnn::ScopedCpuTensorHandle projectionBiasTensor(tensorInfo16);
+
+ AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]);
+ AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]);
+ AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+ AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+ AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+ AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]);
+ AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]);
+ AllocateAndCopyDataToITensorHandle(&projectionWeightsTensor, &projectionWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&projectionBiasTensor, &projectionBias[0]);
+
+ data.m_InputToInputWeights = &inputToInputWeightsTensor;
+ data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+ data.m_InputToCellWeights = &inputToCellWeightsTensor;
+ data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+ data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor;
+ data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+ data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+ data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+ data.m_CellToInputWeights = &cellToInputWeightsTensor;
+ data.m_InputGateBias = &inputGateBiasTensor;
+ data.m_ForgetGateBias = &forgetGateBiasTensor;
+ data.m_CellBias = &cellBiasTensor;
+ data.m_OutputGateBias = &outputGateBiasTensor;
+ data.m_CellToForgetWeights = &cellToForgetWeightsTensor;
+ data.m_CellToOutputWeights = &cellToOutputWeightsTensor;
+ data.m_ProjectionWeights = &projectionWeightsTensor;
+ data.m_ProjectionBias = &projectionBiasTensor;
+
+ // Flags to set test configuration
+ data.m_Parameters.m_ActivationFunc = 4;
+ data.m_Parameters.m_CifgEnabled = false;
+ data.m_Parameters.m_PeepholeEnabled = true;
+ data.m_Parameters.m_ProjectionEnabled = true;
+
+
+ std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+ inputHandle->Allocate();
+ outputStateInHandle->Allocate();
+ cellStateInHandle->Allocate();
+
+ scratchHandle->Allocate();
+ outputStateOutHandle->Allocate();
+ cellStateOutHandle->Allocate();
+ outputHandle->Allocate();
+
+ CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+ CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+ CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+ workloadFactory.Finalize();
+ workload->Execute();
+
+ CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+ return ret;
+
+}
+
+// LSTM float test (CIFG + peephole, no projection): executes the workload once and returns output vs. outputExpected.
+LayerTestResult<float, 2> LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+ const boost::multi_array<float, 2>& input,
+ const boost::multi_array<float, 2>& outputExpected)
+{
+ bool cifgEnabled = true; // No input-gate tensors are set on the descriptor below.
+ bool peepholeEnabled = true; // cellToForget/cellToOutput weights are supplied below.
+ bool projectionEnabled = false; // No projection tensors are set on the descriptor below.
+ // Sizes come from the test data: input is read as [batchSize, inputSize], outputExpected as [*, outputSize].
+ unsigned int batchSize = boost::numeric_cast<unsigned int>(input.shape()[0]);
+ unsigned int inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]);
+
+ unsigned int outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+
+ const unsigned int cellSize = outputSize; // Cell size is taken to be equal to the output size here.
+
+ // Decide the shape of all input tensors
+ armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, armnn::GetDataType<float>());
+ // NOTE(review): scratch is 4*cellSize with CIFG and 3*cellSize without; TfLite documents the opposite - confirm.
+ unsigned int scratchBufferSize = cifgEnabled ? cellSize * 4 : cellSize * 3;
+ armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>());
+
+ // Input data copied from the caller, plus zero-initialised output state and cell state.
+ std::vector<float> inputData;
+ inputData.assign(input.data(), input.data() + batchSize*inputSize);
+ auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputData);
+
+ std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+ auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
+
+ std::vector<float> cellStateInVector(batchSize * cellSize, 0.f);
+ auto cellStateInTensor = MakeTensor<float, 2>(cellStateInTensorInfo, cellStateInVector);
+
+ // The hard-coded literals below hold 8, 16 and 4 values, i.e. they presume inputSize == 2 and cellSize == 4.
+ // Prepare all the weights in the descriptor for LSTM
+ armnn::LstmQueueDescriptor data;
+ armnn::TensorInfo tensorInfoInput({cellSize, inputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfoOutput({cellSize, outputSize}, armnn::GetDataType<float>());
+ armnn::TensorInfo tensorInfoNumUnits({cellSize}, armnn::GetDataType<float>());
+
+ auto inputToCellWeights = MakeTensor<float, 2>(tensorInfoInput,
+ {-0.49770179f, -0.27711356f, -0.09624726f, 0.05100781f,
+ 0.04717243f, 0.48944736f, -0.38535351f,
+ -0.17212132f});
+ auto inputToForgetWeights = MakeTensor<float, 2>(tensorInfoInput,
+ {-0.55291498f, -0.42866567f, 0.13056988f,
+ -0.3633365f, -0.22755712f, 0.28253698f, 0.24407166f,
+ 0.33826375f});
+ auto inputToOutputWeights = MakeTensor<float, 2>(tensorInfoInput,
+ {0.10725588f, -0.02335852f, -0.55932593f,
+ -0.09426838f, -0.44257352f, 0.54939759f,
+ 0.01533556f, 0.42751634f});
+ auto cellBias = MakeTensor<float, 1>(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f});
+ auto forgetGateBias = MakeTensor<float, 1>(tensorInfoNumUnits, {1.f, 1.f, 1.f, 1.f});
+ auto outputGateBias = MakeTensor<float, 1>(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f});
+
+ auto recurrentToCellWeights = MakeTensor<float, 2>(tensorInfoOutput,
+ {0.54066205f, -0.32668582f, -0.43562764f, -0.56094903f, 0.42957711f,
+ 0.01841056f, -0.32764608f, -0.33027974f, -0.10826075f, 0.20675004f,
+ 0.19069612f, -0.03026325f, -0.54532051f, 0.33003211f, 0.44901288f,
+ 0.21193194f});
+ auto recurrentToForgetWeights = MakeTensor<float, 2>(tensorInfoOutput,
+ {-0.13832897f, -0.0515101f, -0.2359007f, -0.16661474f, -0.14340827f,
+ 0.36986142f, 0.23414481f, 0.55899f, 0.10798943f, -0.41174671f, 0.17751795f,
+ -0.34484994f, -0.35874045f, -0.11352962f, 0.27268326f, 0.54058349f});
+
+ auto recurrentToOutputWeights = MakeTensor<float, 2>(tensorInfoOutput,
+ {0.41613156f, 0.42610586f, -0.16495961f, -0.5663873f, 0.30579174f, -0.05115908f,
+ -0.33941799f, 0.23364776f, 0.11178309f, 0.09481031f, -0.26424935f, 0.46261835f,
+ 0.50248802f, 0.26114327f, -0.43736315f, 0.33149987f});
+
+ auto cellToForgetWeights = MakeTensor<float, 1>(tensorInfoNumUnits,
+ {0.47485286f, -0.51955009f, -0.24458408f, 0.31544167f});
+ auto cellToOutputWeights = MakeTensor<float, 1>(tensorInfoNumUnits,
+ {-0.17135078f, 0.82760304f, 0.85573703f, -0.77109635f});
+
+ armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfoInput);
+ armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfoInput);
+ armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfoInput);
+
+ armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfoNumUnits);
+ armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfoNumUnits);
+ armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfoNumUnits);
+
+ armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfoOutput);
+ armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfoOutput);
+ armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfoOutput);
+
+
+ armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfoNumUnits);
+ armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfoNumUnits);
+
+ AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+
+ AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+ AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+ AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+
+ AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+ AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+
+ AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]);
+ AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]);
+
+ // The descriptor stores raw pointers to the local tensor handles declared above.
+ data.m_InputToCellWeights = &inputToCellWeightsTensor;
+ data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+ data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+
+ data.m_CellBias = &cellBiasTensor;
+ data.m_ForgetGateBias = &forgetGateBiasTensor;
+ data.m_OutputGateBias = &outputGateBiasTensor;
+
+ data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+ data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+ data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+
+ data.m_CellToForgetWeights = &cellToForgetWeightsTensor;
+ data.m_CellToOutputWeights = &cellToOutputWeightsTensor;
+
+ // other parameters for the descriptor
+ data.m_Parameters.m_CifgEnabled = cifgEnabled;
+ data.m_Parameters.m_ProjectionEnabled = projectionEnabled;
+ data.m_Parameters.m_PeepholeEnabled = peepholeEnabled;
+
+ data.m_Parameters.m_ActivationFunc = 4; // NOTE(review): 4 presumably selects tanh - confirm against the descriptor.
+ data.m_Parameters.m_ClippingThresProj = 0.0;
+ data.m_Parameters.m_ClippingThresCell = 0.0;
+
+
+ // List of outputs (ret0-ret2 are read back after execution, but only ret3 is returned to the caller)
+ std::vector<float> scratchBufferVector(batchSize * scratchBufferSize, 0.f);
+ auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
+ LayerTestResult<float, 2> ret0(scratchBufferTensorInfo);
+
+ // Output state for a certain time step
+ std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+ auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector);
+ LayerTestResult<float, 2> ret1(outputStateOutTensorInfo);
+
+ // Cell state for a certain time step
+ std::vector<float> cellStateOutVector(batchSize * cellSize, 0.f);
+ auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector);
+ LayerTestResult<float, 2> ret2(cellStateOutTensorInfo);
+
+ // Output for a certain time step; the caller's expected values are stored in ret3.outputExpected
+ std::vector<float> outputVector(batchSize * outputSize, 0.f);
+ auto outputTensor = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+ std::vector<float> outputData;
+ outputData.assign(outputExpected.data(), outputExpected.data() + batchSize*outputSize);
+ LayerTestResult<float, 2> ret3(outputTensorInfo);
+ ret3.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputData);
+
+ // Prepare the inputs and outputs for the workload
+ std::unique_ptr<armnn::ITensorHandle> inputHandle =
+ workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+ workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+ workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+
+ std::unique_ptr<armnn::ITensorHandle> scratchBufferHandle =
+ workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+ workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+ workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputHandle =
+ workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ armnn::WorkloadInfo info;
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+ AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+
+ AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchBufferHandle.get());
+ AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+ AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+ std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+
+ // Allocate every input and output handle now that the workload has been created.
+ inputHandle->Allocate();
+ outputStateInHandle->Allocate();
+ cellStateInHandle->Allocate();
+
+ scratchBufferHandle->Allocate();
+ outputStateOutHandle->Allocate();
+ cellStateOutHandle->Allocate();
+ outputHandle->Allocate();
+
+ // Upload the input data and the zeroed input states.
+ CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+ CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+ CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+ // Zero-fill the scratch buffer and the state outputs before execution.
+ CopyDataToITensorHandle(scratchBufferHandle.get(), &scratchBufferTensor[0][0]);
+ CopyDataToITensorHandle(outputStateOutHandle.get(), &outputStateOutTensor[0][0]);
+ CopyDataToITensorHandle(cellStateOutHandle.get(), &cellStateOutTensor[0][0]);
+
+ workloadFactory.Finalize();
+ workload->Execute();
+ // Read back all four workload outputs.
+ CopyDataFromITensorHandle(&ret0.output[0][0], scratchBufferHandle.get());
+ CopyDataFromITensorHandle(&ret1.output[0][0], outputStateOutHandle.get());
+ CopyDataFromITensorHandle(&ret2.output[0][0], cellStateOutHandle.get());
+ CopyDataFromITensorHandle(&ret3.output[0][0], outputHandle.get());
+
+ return ret3;
+}
diff --git a/src/armnn/backends/test/MemCopyTests.cpp b/src/armnn/backends/test/MemCopyTests.cpp
index 32331789e9..24a951c395 100644
--- a/src/armnn/backends/test/MemCopyTests.cpp
+++ b/src/armnn/backends/test/MemCopyTests.cpp
@@ -19,6 +19,10 @@
#include "TensorCopyUtils.hpp"
#include "WorkloadTestUtils.hpp"
+#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED
+#include "../ArmComputeTensorUtils.hpp"
+#endif
+
BOOST_AUTO_TEST_SUITE(MemCopyTestSuite)
void MemCopyTest(armnn::IWorkloadFactory& srcWorkloadFactory, armnn::IWorkloadFactory& dstWorkloadFactory,
@@ -81,6 +85,26 @@ void MemCopyTest(bool withSubtensors)
MemCopyTest(srcWorkloadFactory, dstWorkloadFactory, withSubtensors);
}
+#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED
+// Checks that the armnn conversion helpers return ACL strides and shapes with the dimension order reversed.
+BOOST_AUTO_TEST_CASE(AclTypeConversions)
+{
+ arm_compute::Strides aclStrides(1,2,3,4);
+ arm_compute::TensorShape aclShape(5,6,7,8);
+ armnn::TensorShape armnnStrides = armnn::armcomputetensorutils::GetStrides(aclStrides);
+ armnn::TensorShape armnnShape = armnn::armcomputetensorutils::GetShape(aclShape);
+ BOOST_TEST(armnnStrides[0] == 4);
+ BOOST_TEST(armnnStrides[1] == 3);
+ BOOST_TEST(armnnStrides[2] == 2);
+ BOOST_TEST(armnnStrides[3] == 1);
+ // The shape conversion is expected to reverse the dimensions in the same way.
+ BOOST_TEST(armnnShape[0] == 8);
+ BOOST_TEST(armnnShape[1] == 7);
+ BOOST_TEST(armnnShape[2] == 6);
+ BOOST_TEST(armnnShape[3] == 5);
+}
+#endif
+
#if ARMCOMPUTECL_ENABLED
BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu)
diff --git a/src/armnn/backends/test/NormTestImpl.hpp b/src/armnn/backends/test/NormTestImpl.hpp
index d9dc01592a..df8219ddbd 100644
--- a/src/armnn/backends/test/NormTestImpl.hpp
+++ b/src/armnn/backends/test/NormTestImpl.hpp
@@ -87,7 +87,7 @@ LayerTestResult<float,4> SimpleNormalizationTestImpl(armnn::IWorkloadFactory& wo
// When normalising within channels, the 3x3 kernel covers the entire 2x2 input at every index.
// Therefore, all output values should equal the inputs, but divided by:
// pow((kappa + (accumulatedScale * alpha)), beta)
- // ...where accumulatedScale is the sum of every element squared
+ // ...where accumulatedScale is the sum of every element squared.
float divisor[inputNum];
for(int i = 0; i < boost::numeric_cast<int>(inputNum); i++)
{
@@ -139,7 +139,7 @@ LayerTestResult<float,4> SimpleNormalizationTestImpl(armnn::IWorkloadFactory& wo
}
break;
}
- case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough
+ case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough.
default:
{
throw armnn::UnimplementedException("Unsupported normalisation method type, "
diff --git a/src/armnn/backends/test/Pooling2dTestImpl.hpp b/src/armnn/backends/test/Pooling2dTestImpl.hpp
index ab9fd6d6fb..e6e0e6721a 100644
--- a/src/armnn/backends/test/Pooling2dTestImpl.hpp
+++ b/src/armnn/backends/test/Pooling2dTestImpl.hpp
@@ -155,21 +155,21 @@ LayerTestResult<T, 4> SimpleMaxPooling2dSize3x3Stride2x4TestCommon(armnn::IWorkl
3.0f, 5.0f, 4.0f, 0.0f, 1.0f, 5.0f, 9.0f, 7.0f,
});
- // Construct input data
+ // Constructs input data.
std::vector<float> inputData;
auto negator = [](float f) { return -f; };
- // First image (two channels where the second channel is the negative of the first one)
+ // First image (two channels where the second channel is the negative of the first one).
inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end());
std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator);
- // Second image (same as first image)
+ // Second image (same as first image).
inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end());
std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator);
auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
- // these were calculated manually
+ // These were calculated manually.
auto shape(GetTensorShapeAsArray<4>(outputTensorInfo));
boost::multi_array<T, 4> outputExpected(shape);
if (forceNoPadding)
@@ -527,13 +527,13 @@ LayerTestResult<T, 4> AsymmetricNonSquarePooling2dTestCommon(armnn::IWorkloadFac
descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
- // Construct input data
+ // Construct input data.
auto input = MakeTensor<T, 4>(inputTensorInfo,
QuantizedVector<T>(qScale, qOffset, {
1.0f, 3.0f, 4.0f,
}));
- // these were calculated manually
+ // These were calculated manually.
auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
QuantizedVector<T>(qScale, qOffset, {
0.0f, 3.0f, 0.0f, 3.0f,
@@ -686,7 +686,7 @@ LayerTestResult<T, 4> SimpleMaxPooling2dSize2x2Stride2x2TestCommon(armnn::IWorkl
438.0f, 564.0f, 573.0f, 402.0f
};
- // Note that left and right edges will be 0.f, due to the 2x2 max pooling only accessing zeros here
+ // Note that left and right edges will be 0.f, due to the 2x2 max pooling only accessing zeros here.
std::vector<float> expectedOutputDataWithPadding = {
0.0f, 510.0f, 780.0f, 654.0f, 0.0f,
0.0f, 438.0f, 618.0f, 402.0f, 0.0f
diff --git a/src/armnn/backends/test/QuantizeHelper.hpp b/src/armnn/backends/test/QuantizeHelper.hpp
index bfaf9342f0..0a6ceb761d 100644
--- a/src/armnn/backends/test/QuantizeHelper.hpp
+++ b/src/armnn/backends/test/QuantizeHelper.hpp
@@ -61,7 +61,7 @@ struct IsFloatingPointIterator
};
template <typename T, typename FloatIt,
-typename std::enable_if<IsFloatingPointIterator<FloatIt>::value, int>::type=0 // Make sure valid fp iterator
+typename std::enable_if<IsFloatingPointIterator<FloatIt>::value, int>::type=0 // Makes sure fp iterator is valid.
>
std::vector<T> QuantizedVector(float qScale, int32_t qOffset, FloatIt first, FloatIt last)
{
diff --git a/src/armnn/backends/test/Reference.cpp b/src/armnn/backends/test/Reference.cpp
index b60483a4d9..dedeb50e33 100644
--- a/src/armnn/backends/test/Reference.cpp
+++ b/src/armnn/backends/test/Reference.cpp
@@ -127,25 +127,8 @@ ARMNN_AUTO_TEST_CASE(FullyConnectedLarge, FullyConnectedLargeTest, false)
ARMNN_AUTO_TEST_CASE(FullyConnectedLargeTransposed, FullyConnectedLargeTest, true)
// Splitter
-BOOST_AUTO_TEST_CASE(SimpleSplitter)
-{
- armnn::RefWorkloadFactory workloadFactory;
- auto testResult = SplitterTest(workloadFactory);
- for (unsigned int i = 0; i < testResult.size(); ++i)
- {
- BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected));
- }
-}
-
-BOOST_AUTO_TEST_CASE(SplitterUint8)
-{
- armnn::RefWorkloadFactory workloadFactory;
- auto testResult = SplitterUint8Test(workloadFactory);
- for (unsigned int i = 0; i < testResult.size(); ++i)
- {
- BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected));
- }
-}
+ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest)
+ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test)
ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest)
ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test)
@@ -242,4 +225,9 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test)
ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test)
ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test)
+// Convert from Float16 to Float32
+ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
+// Convert from Float32 to Float16
+ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test)
+
BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/backends/test/SoftmaxTestImpl.hpp b/src/armnn/backends/test/SoftmaxTestImpl.hpp
index 4c3e0b73dd..9ed7f603a1 100644
--- a/src/armnn/backends/test/SoftmaxTestImpl.hpp
+++ b/src/armnn/backends/test/SoftmaxTestImpl.hpp
@@ -39,7 +39,7 @@ LayerTestResult<T, 2> SimpleSoftmaxTestImpl(armnn::IWorkloadFactory& workloadFac
LayerTestResult<T, 2> ret(outputTensorInfo);
- // Each row is independently softmax'd
+ // Each row is independently softmax'd.
auto input = MakeTensor<T, 2>(inputTensorInfo, std::vector<T>(
QuantizedVector<T>(qScale, 0, {
0.f, 1.f, 0.f, 0.f,
diff --git a/src/armnn/backends/test/SplitterTestImpl.hpp b/src/armnn/backends/test/SplitterTestImpl.hpp
index 70b798eafa..48c0730fa7 100644
--- a/src/armnn/backends/test/SplitterTestImpl.hpp
+++ b/src/armnn/backends/test/SplitterTestImpl.hpp
@@ -27,35 +27,35 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
// NOTE: Compute Library imposes a restriction that the x and y dimension (input height and width)
// cannot be split.
- // For the reasons for this see first comment on https://jira.arm.com/browse/IVGCVSW-1239
+ // For the reasons for this, see first comment on https://jira.arm.com/browse/IVGCVSW-1239
//
- // this test has therefore been recast to split the channels, then split the resulting subtensor
+ // This test has therefore been recast to split the channels, then split the resulting subtensor.
- // to take channel 0 of original output
- // and channel 0 and channel 1 of the split subtensor
+ // To take channel 0 of original output
+ // and channel 0 and channel 1 of the split subtensor.
unsigned int outputWidth1 = inputWidth;
unsigned int outputHeight1 = inputHeight;
unsigned int outputChannels1 = 1;
- // to take channel 1 and 2 of the original output
+ // To take channel 1 and 2 of the original output.
unsigned int outputWidth2 = inputWidth;
unsigned int outputHeight2 = inputHeight;
unsigned int outputChannels2 = 2;
- // Define the tensor descriptors
+ // Define the tensor descriptors.
armnn::TensorInfo inputTensorInfo({ inputChannels, inputHeight, inputWidth }, armnn::GetDataType<T>());
- // outputs of the original split
+ // Outputs of the original split.
armnn::TensorInfo outputTensorInfo1({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType<T>());
armnn::TensorInfo outputTensorInfo2({ outputChannels2, outputHeight2, outputWidth2 }, armnn::GetDataType<T>());
- // outputs of the subsequent subtensor split
+ // Outputs of the subsequent subtensor split.
armnn::TensorInfo outputTensorInfo3({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType<T>());
armnn::TensorInfo outputTensorInfo4({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType<T>());
// Set quantization parameters if the requested type is a quantized type.
- // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize
+ // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize.
if(armnn::IsQuantizedType<T>())
{
inputTensorInfo.SetQuantizationScale(qScale);
@@ -100,7 +100,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
})
));
- // channel 0 of the original input
+ // Channel 0 of the original input.
ret1.outputExpected = MakeTensor<T, 3>(outputTensorInfo1, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
@@ -112,7 +112,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
})
));
- // channel 1 & 2 of the original input
+ // Channel 1 & 2 of the original input.
ret2.outputExpected = MakeTensor<T, 3>(outputTensorInfo2, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
@@ -131,7 +131,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
})
));
- // channel 0 of return 2 (i.e. channels 1 and 2 of the original input)
+ // Channel 0 of return 2 (i.e. channels 1 and 2 of the original input).
ret3.outputExpected = MakeTensor<T, 3>(outputTensorInfo3, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
@@ -143,7 +143,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
})
));
- // channel 1 of return 2
+ // Channel 1 of return 2.
ret4.outputExpected = MakeTensor<T, 3>(outputTensorInfo4, std::vector<T>(
QuantizedVector<T>(qScale, qOffset, {
61.0f, 62.0f, 63.0f, 64.0f, 65.0f,
@@ -155,19 +155,19 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
})
));
- // NOTE: as a corollary of the no splitting of x and y restriction the x and y values of the view origins
+ // NOTE: as a corollary of the no-splitting of x and y restriction, the x and y values of the view origins
// have to be zero, the co-ordinates are as per the tensor info above channels, height/y, width/x
- // note that under the hood the compute engine reverses these i.e. its coordinate system is x, y, channels
- std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //extent of the window is defined by size of output[0]
+ // note that under the hood the compute engine reverses these i.e. its coordinate system is x, y, channels.
+ std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of output[0].
armnn::SplitterQueueDescriptor::ViewOrigin window1(wOrigin1);
- std::vector<unsigned int> wOrigin2 = {1, 0, 0}; //extent of the window is defined by size of output[1]
+ std::vector<unsigned int> wOrigin2 = {1, 0, 0}; //Extent of the window is defined by size of output[1].
armnn::SplitterQueueDescriptor::ViewOrigin window2(wOrigin2);
- std::vector<unsigned int> wOrigin3 = {0, 0, 0}; //extent of the window is defined by size of output[2]
+ std::vector<unsigned int> wOrigin3 = {0, 0, 0}; //Extent of the window is defined by size of output[2].
armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3);
- std::vector<unsigned int> wOrigin4 = {1, 0, 0}; //extent of the window is defined by size of output[3]
+ std::vector<unsigned int> wOrigin4 = {1, 0, 0}; //Extent of the window is defined by size of output[3].
armnn::SplitterQueueDescriptor::ViewOrigin window4(wOrigin4);
bool subTensorsSupported = workloadFactory.SupportsSubTensors();
@@ -217,7 +217,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo
CopyDataFromITensorHandle(&ret1.output[0][0][0], outputHandle1.get());
CopyDataFromITensorHandle(&ret2.output[0][0][0], outputHandle2.get());
-// // Do the second split
+// // Do the second split.
armnn::SplitterQueueDescriptor data2;
armnn::WorkloadInfo info2;
AddInputToWorkload(data2, info2, outputTensorInfo2, outputHandle2.get());
diff --git a/src/armnn/backends/test/TensorCopyUtils.cpp b/src/armnn/backends/test/TensorCopyUtils.cpp
index e15c12a76f..82e80a52fe 100644
--- a/src/armnn/backends/test/TensorCopyUtils.cpp
+++ b/src/armnn/backends/test/TensorCopyUtils.cpp
@@ -6,6 +6,7 @@
#include <algorithm>
#include <cstring>
#include <boost/cast.hpp>
+#include <Half.hpp>
#include "TensorCopyUtils.hpp"
@@ -47,12 +48,15 @@ void CopyDataToITensorHandle(armnn::ITensorHandle* tensorHandle, const void* mem
case arm_compute::DataType::QASYMM8:
CopyArmComputeITensorData(static_cast<const uint8_t*>(mem), handle->GetTensor());
break;
+ case arm_compute::DataType::F16:
+ CopyArmComputeITensorData(static_cast<const armnn::Half*>(mem), handle->GetTensor());
+ break;
default:
{
throw armnn::UnimplementedException();
}
}
- handle->UnMap();
+ handle->Unmap();
break;
}
#endif
@@ -108,12 +112,15 @@ void CopyDataFromITensorHandle(void* mem, const armnn::ITensorHandle* tensorHand
case arm_compute::DataType::QASYMM8:
CopyArmComputeITensorData(handle->GetTensor(), static_cast<uint8_t*>(mem));
break;
+ case arm_compute::DataType::F16:
+ CopyArmComputeITensorData(handle->GetTensor(), static_cast<armnn::Half*>(mem));
+ break;
default:
{
throw armnn::UnimplementedException();
}
}
- const_cast<armnn::IClTensorHandle*>(handle)->UnMap();
+ const_cast<armnn::IClTensorHandle*>(handle)->Unmap();
break;
}
#endif
diff --git a/src/armnn/backends/test/WorkloadDataValidation.cpp b/src/armnn/backends/test/WorkloadDataValidation.cpp
index c3a9d40116..bc3898b405 100644
--- a/src/armnn/backends/test/WorkloadDataValidation.cpp
+++ b/src/armnn/backends/test/WorkloadDataValidation.cpp
@@ -22,7 +22,7 @@ BOOST_AUTO_TEST_CASE(QueueDescriptor_Validate_WrongNumOfInputsOutputs)
{
InputQueueDescriptor invalidData;
WorkloadInfo invalidInfo;
- //invalid argument exception is expected, because no inputs and no outputs were defined
+ //Invalid argument exception is expected, because no inputs and no outputs were defined.
BOOST_CHECK_THROW(RefWorkloadFactory().CreateInput(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
@@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE(RefPooling2dFloat32Workload_Validate_WrongDimTensor)
armnn::TensorInfo inputTensorInfo;
armnn::TensorInfo outputTensorInfo;
- unsigned int inputShape[] = {2, 3, 4}; // <- invalid - input tensor has to be 4D
+ unsigned int inputShape[] = {2, 3, 4}; // <- Invalid - input tensor has to be 4D.
unsigned int outputShape[] = {2, 3, 4, 5};
outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::DataType::Float32);
@@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE(RefPooling2dFloat32Workload_Validate_WrongDimTensor)
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr);
- // invalid argument exception is expected, input tensor has to be 4D
+ // Invalid argument exception is expected, input tensor has to be 4D.
BOOST_CHECK_THROW(RefPooling2dFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
@@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxQueueDescriptor_Validate_WrongInputHeight)
unsigned int inputNum = 2;
unsigned int outputChannels = inputChannels;
- unsigned int outputHeight = inputHeight + 1; //makes data invalid - Softmax expects height and width to be 1
+ unsigned int outputHeight = inputHeight + 1; //Makes data invalid - Softmax expects height and width to be 1.
unsigned int outputWidth = inputWidth;
unsigned int outputNum = inputNum;
@@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxQueueDescriptor_Validate_WrongInputHeight)
AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr);
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
- //invalid argument exception is expected, because height != 1
+ //Invalid argument exception is expected, because height != 1.
BOOST_CHECK_THROW(RefSoftmaxFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
@@ -90,7 +90,7 @@ BOOST_AUTO_TEST_CASE(FullyConnectedQueueDescriptor_Validate_RequiredDataMissing)
unsigned int outputChannels = 3;
unsigned int outputNum = 2;
- // Define the tensor descriptors
+ // Define the tensor descriptors.
armnn::TensorInfo inputTensorInfo;
armnn::TensorInfo outputTensorInfo;
armnn::TensorInfo weightsDesc;
@@ -120,8 +120,8 @@ BOOST_AUTO_TEST_CASE(FullyConnectedQueueDescriptor_Validate_RequiredDataMissing)
invalidData.m_Parameters.m_TransposeWeightMatrix = false;
- //invalid argument exception is expected, because not all required fields have been provided
- //in particular inputsData[0], outputsData[0] and weightsData can not be null
+ //Invalid argument exception is expected, because not all required fields have been provided.
+ //In particular inputsData[0], outputsData[0] and weightsData can not be null.
BOOST_CHECK_THROW(RefFullyConnectedFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
@@ -135,8 +135,8 @@ BOOST_AUTO_TEST_CASE(NormalizationQueueDescriptor_Validate_WrongInputHeight)
constexpr unsigned int outputNum = inputNum;
constexpr unsigned int outputChannels = inputChannels;
- constexpr unsigned int outputHeight = inputHeight + 1; //makes data invalid - normalization requires
- //input and output to have the same dimensions
+ constexpr unsigned int outputHeight = inputHeight + 1; //Makes data invalid - normalization requires
+ //input and output to have the same dimensions.
constexpr unsigned int outputWidth = inputWidth;
@@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(NormalizationQueueDescriptor_Validate_WrongInputHeight)
invalidData.m_Parameters.m_Beta = beta;
invalidData.m_Parameters.m_K = kappa;
- //invalid argument exception is expected, because input height != output height
+ //Invalid argument exception is expected, because input height != output height.
BOOST_CHECK_THROW(RefNormalizationFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
@@ -201,7 +201,7 @@ BOOST_AUTO_TEST_CASE(SplitterQueueDescriptor_Validate_WrongWindow)
AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr);
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
- // invalid since it has only 3 dimensions while the input tensor is 4d
+ // Invalid, since it has only 3 dimensions while the input tensor is 4d.
std::vector<unsigned int> wOrigin = {0, 0, 0};
armnn::SplitterQueueDescriptor::ViewOrigin window(wOrigin);
invalidData.m_ViewOrigins.push_back(window);
@@ -210,7 +210,7 @@ BOOST_AUTO_TEST_CASE(SplitterQueueDescriptor_Validate_WrongWindow)
"match input.");
BOOST_CHECK_THROW(RefSplitterFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
- // invalid since window extends past the boundary of input tensor
+ // Invalid, since window extends past the boundary of input tensor.
std::vector<unsigned int> wOrigin3 = {0, 0, 15, 0};
armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3);
invalidData.m_ViewOrigins[0] = window3;
@@ -259,7 +259,7 @@ BOOST_AUTO_TEST_CASE(MergerQueueDescriptor_Validate_WrongWindow)
AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr);
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
- // invalid since it has only 3 dimensions while the input tensor is 4d
+ // Invalid, since it has only 3 dimensions while the input tensor is 4d.
std::vector<unsigned int> wOrigin = {0, 0, 0};
armnn::MergerQueueDescriptor::ViewOrigin window(wOrigin);
invalidData.m_ViewOrigins.push_back(window);
@@ -268,7 +268,7 @@ BOOST_AUTO_TEST_CASE(MergerQueueDescriptor_Validate_WrongWindow)
"match input.");
BOOST_CHECK_THROW(RefMergerFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
- // invalid since window extends past the boundary of output tensor
+ // Invalid, since window extends past the boundary of output tensor.
std::vector<unsigned int> wOrigin3 = {0, 0, 15, 0};
armnn::MergerQueueDescriptor::ViewOrigin window3(wOrigin3);
invalidData.m_ViewOrigins[0] = window3;
@@ -308,17 +308,17 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputNumbers)
AddInputToWorkload(invalidData, invalidInfo, input1TensorInfo, nullptr);
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
- // too few inputs
+ // Too few inputs.
BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
AddInputToWorkload(invalidData, invalidInfo, input2TensorInfo, nullptr);
- // correct
+ // Correct.
BOOST_CHECK_NO_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo));
AddInputToWorkload(invalidData, invalidInfo, input3TensorInfo, nullptr);
- // too many inputs
+ // Too many inputs.
BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
@@ -331,7 +331,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes)
unsigned int shape1[] = {1, 1, 2, 1};
unsigned int shape2[] = {1, 1, 3, 2};
- // Incompatible shapes even with broadcasting
+ // Incompatible shapes even with broadcasting.
{
input1TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32);
input2TensorInfo = armnn::TensorInfo(4, shape2, armnn::DataType::Float32);
@@ -347,7 +347,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes)
BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
- // Output size not compatible with input sizes
+ // Output size not compatible with input sizes.
{
input1TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32);
input2TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32);
@@ -360,7 +360,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes)
AddInputToWorkload(invalidData, invalidInfo, input2TensorInfo, nullptr);
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
- // output differs
+ // Output differs.
BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
}
@@ -374,7 +374,7 @@ BOOST_AUTO_TEST_CASE(MultiplicationQueueDescriptor_Validate_InputTensorDimension
constexpr unsigned int input0Shape[] = { 2, 2, 4, 4 };
constexpr std::size_t dimensionCount = std::extent<decltype(input0Shape)>::value;
- // Check dimension consistency for input tensors
+ // Checks dimension consistency for input tensors.
for (unsigned int dimIndex = 0; dimIndex < dimensionCount; ++dimIndex)
{
unsigned int input1Shape[dimensionCount];
@@ -399,7 +399,7 @@ BOOST_AUTO_TEST_CASE(MultiplicationQueueDescriptor_Validate_InputTensorDimension
BOOST_CHECK_THROW(RefMultiplicationFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
- // Check dimension consistency for input and output tensors
+ // Checks dimension consistency for input and output tensors.
for (unsigned int dimIndex = 0; dimIndex < dimensionCount; ++dimIndex)
{
unsigned int outputShape[dimensionCount];
@@ -430,7 +430,7 @@ BOOST_AUTO_TEST_CASE(ReshapeQueueDescriptor_Validate_MismatchingNumElements)
armnn::TensorInfo inputTensorInfo;
armnn::TensorInfo outputTensorInfo;
- // The input and output shapes should have the same number of elements, but these don't
+ // The input and output shapes should have the same number of elements, but these don't.
unsigned int inputShape[] = { 1, 1, 2, 3 };
unsigned int outputShape[] = { 1, 1, 1, 2 };
@@ -443,8 +443,29 @@ BOOST_AUTO_TEST_CASE(ReshapeQueueDescriptor_Validate_MismatchingNumElements)
AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr);
AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
- // InvalidArgumentException is expected, because the number of elements don't match
+ // InvalidArgumentException is expected, because the number of elements don't match.
BOOST_CHECK_THROW(RefReshapeFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException);
}
+
+BOOST_AUTO_TEST_CASE(LstmQueueDescriptor_Validate) // Checks that Validate() rejects an LSTM descriptor given one input and one output.
+{
+ armnn::TensorInfo inputTensorInfo;
+ armnn::TensorInfo outputTensorInfo;
+
+ unsigned int inputShape[] = { 1, 2 }; // 2-D input shape.
+ unsigned int outputShape[] = { 1 }; // 1-D output shape.
+
+ inputTensorInfo = armnn::TensorInfo(2, inputShape, armnn::DataType::Float32);
+ outputTensorInfo = armnn::TensorInfo(1, outputShape, armnn::DataType::Float32);
+
+ LstmQueueDescriptor invalidData;
+ WorkloadInfo invalidInfo;
+
+ AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); // Handle is nullptr; only the tensor info is supplied.
+ AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
+
+ BOOST_CHECK_THROW(invalidData.Validate(invalidInfo), armnn::InvalidArgumentException); // NOTE(review): presumably thrown because an LSTM needs more inputs/outputs than this - confirm against LstmQueueDescriptor::Validate.
+}
+
BOOST_AUTO_TEST_SUITE_END()