From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001
From: Felix Thomasmathibalan
Date: Wed, 27 Sep 2023 17:46:17 +0100
Subject: Apply clang-format on repository

Code is formatted as per a revised clang-format configuration file (not part of
this delivery). Version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript ...)

And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow-up for formatting of the .cl files and the files under
tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gunes Bayir
---
 src/runtime/Allocator.cpp | 2 +- src/runtime/BlobLifetimeManager.cpp | 30 +- src/runtime/BlobMemoryPool.cpp | 6 +- src/runtime/CL/CLBufferAllocator.cpp | 3 +- src/runtime/CL/CLGEMMHeuristicsHandle.cpp | 3 +- src/runtime/CL/CLHelpers.cpp | 41 +- src/runtime/CL/CLMemory.cpp | 12 +- src/runtime/CL/CLMemoryRegion.cpp | 26 +- src/runtime/CL/CLOperator.cpp | 5 +- src/runtime/CL/CLRuntimeContext.cpp | 6 +- src/runtime/CL/CLScheduler.cpp | 47 +- src/runtime/CL/CLSubTensor.cpp | 10 +- src/runtime/CL/CLTensorAllocator.cpp | 40 +- src/runtime/CL/CLTuner.cpp | 90 +- src/runtime/CL/ICLSimpleFunction.cpp | 5 +- src/runtime/CL/Utils.cpp | 16 +- src/runtime/CL/functions/CLActivationLayer.cpp | 22 +- src/runtime/CL/functions/CLArgMinMaxLayer.cpp | 63 +- .../CL/functions/CLBatchNormalizationLayer.cpp | 37 +- src/runtime/CL/functions/CLBatchToSpaceLayer.cpp | 30 +- src/runtime/CL/functions/CLBitwiseAnd.cpp | 10 +- src/runtime/CL/functions/CLBitwiseNot.cpp | 5 +- src/runtime/CL/functions/CLBitwiseOr.cpp | 10 +- src/runtime/CL/functions/CLBitwiseXor.cpp | 8 +- .../CL/functions/CLBoundingBoxTransform.cpp | 19 +- src/runtime/CL/functions/CLCast.cpp | 22 +- src/runtime/CL/functions/CLChannelShuffleLayer.cpp | 7 +- src/runtime/CL/functions/CLComparison.cpp | 37 +- src/runtime/CL/functions/CLConcatenateLayer.cpp | 28 +- src/runtime/CL/functions/CLConv3D.cpp | 39 +- .../functions/CLConvertFullyConnectedWeights.cpp | 32 +- src/runtime/CL/functions/CLConvolutionLayer.cpp | 102 +- src/runtime/CL/functions/CLCopy.cpp | 15 +- src/runtime/CL/functions/CLCrop.cpp | 48 +- src/runtime/CL/functions/CLCropResize.cpp | 194 ++-- src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 74 +- .../CL/functions/CLDeconvolutionLayerUpsample.cpp | 17 +- src/runtime/CL/functions/CLDepthConvertLayer.cpp | 26 +- src/runtime/CL/functions/CLDepthToSpaceLayer.cpp | 8 +- .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 151 +-- src/runtime/CL/functions/CLDequantizationLayer.cpp | 17 +- .../CL/functions/CLDirectConvolutionLayer.cpp | 46 +- .../CL/functions/CLDirectDeconvolutionLayer.cpp | 80 +- .../CL/functions/CLElementwiseOperations.cpp | 206 ++-- .../CL/functions/CLElementwiseUnaryLayer.cpp | 78 +- src/runtime/CL/functions/CLFFT1D.cpp | 30 +- src/runtime/CL/functions/CLFFT2D.cpp | 16 +- src/runtime/CL/functions/CLFFTConvolutionLayer.cpp | 123 ++- src/runtime/CL/functions/CLFill.cpp | 17 +- src/runtime/CL/functions/CLFlattenLayer.cpp | 24 +- src/runtime/CL/functions/CLFloor.cpp | 12 +- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 64 +- .../CL/functions/CLFuseBatchNormalization.cpp | 57 
+- src/runtime/CL/functions/CLGEMM.cpp | 56 +- .../CL/functions/CLGEMMConvolutionLayer.cpp | 88 +- .../CL/functions/CLGEMMDeconvolutionLayer.cpp | 196 ++-- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 38 +- src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 34 +- src/runtime/CL/functions/CLGather.cpp | 8 +- .../CL/functions/CLGenerateProposalsLayer.cpp | 192 ++-- .../CL/functions/CLIndirectConvolutionLayer.cpp | 44 +- .../CL/functions/CLInstanceNormalizationLayer.cpp | 36 +- src/runtime/CL/functions/CLL2NormalizeLayer.cpp | 10 +- src/runtime/CL/functions/CLLSTMLayer.cpp | 575 +++++++---- src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 410 +++++--- src/runtime/CL/functions/CLLogicalAnd.cpp | 26 +- src/runtime/CL/functions/CLLogicalNot.cpp | 14 +- src/runtime/CL/functions/CLLogicalOr.cpp | 26 +- src/runtime/CL/functions/CLMatMul.cpp | 29 +- src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp | 21 +- .../functions/CLMeanStdDevNormalizationLayer.cpp | 7 +- src/runtime/CL/functions/CLNormalizationLayer.cpp | 20 +- .../CL/functions/CLNormalizePlanarYUVLayer.cpp | 20 +- src/runtime/CL/functions/CLPReluLayer.cpp | 23 +- src/runtime/CL/functions/CLPadLayer.cpp | 41 +- src/runtime/CL/functions/CLPermute.cpp | 18 +- .../CL/functions/CLPixelWiseMultiplication.cpp | 82 +- src/runtime/CL/functions/CLPooling3dLayer.cpp | 20 +- src/runtime/CL/functions/CLPoolingLayer.cpp | 31 +- src/runtime/CL/functions/CLPriorBoxLayer.cpp | 34 +- src/runtime/CL/functions/CLQLSTMLayer.cpp | 942 ++++++++++------- src/runtime/CL/functions/CLQuantizationLayer.cpp | 10 +- src/runtime/CL/functions/CLRNNLayer.cpp | 53 +- src/runtime/CL/functions/CLROIAlignLayer.cpp | 20 +- src/runtime/CL/functions/CLROIPoolingLayer.cpp | 19 +- src/runtime/CL/functions/CLRange.cpp | 5 +- src/runtime/CL/functions/CLReduceMean.cpp | 90 +- src/runtime/CL/functions/CLReductionOperation.cpp | 65 +- src/runtime/CL/functions/CLReorgLayer.cpp | 7 +- src/runtime/CL/functions/CLReshapeLayer.cpp | 14 +- src/runtime/CL/functions/CLReverse.cpp | 7 +- src/runtime/CL/functions/CLScale.cpp | 15 +- src/runtime/CL/functions/CLSelect.cpp | 8 +- src/runtime/CL/functions/CLSlice.cpp | 41 +- src/runtime/CL/functions/CLSoftmaxLayer.cpp | 22 +- src/runtime/CL/functions/CLSpaceToBatchLayer.cpp | 71 +- src/runtime/CL/functions/CLSpaceToDepthLayer.cpp | 10 +- src/runtime/CL/functions/CLSplit.cpp | 3 +- src/runtime/CL/functions/CLStackLayer.cpp | 21 +- src/runtime/CL/functions/CLStridedSlice.cpp | 81 +- src/runtime/CL/functions/CLTile.cpp | 8 +- src/runtime/CL/functions/CLTranspose.cpp | 12 +- src/runtime/CL/functions/CLUnstack.cpp | 38 +- .../CL/functions/CLWinogradConvolutionLayer.cpp | 65 +- src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp | 240 ++--- src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp | 34 +- src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp | 169 +-- src/runtime/CL/gemm/CLGEMMKernelSelection.h | 3 +- .../gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp | 74 +- .../CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h | 7 +- src/runtime/CL/mlgo/Common.h | 40 +- src/runtime/CL/mlgo/HeuristicTree.cpp | 89 +- src/runtime/CL/mlgo/HeuristicTree.h | 24 +- src/runtime/CL/mlgo/MLGOHeuristics.cpp | 99 +- src/runtime/CL/mlgo/MLGOHeuristics.h | 6 +- src/runtime/CL/mlgo/MLGOParser.cpp | 188 ++-- src/runtime/CL/mlgo/MLGOParser.h | 9 +- src/runtime/CL/mlgo/Utils.cpp | 48 +- src/runtime/CL/mlgo/Utils.h | 10 +- src/runtime/CL/tuners/CLTuningParametersList.cpp | 50 +- src/runtime/CPP/CPPScheduler.cpp | 94 +- src/runtime/CPP/SingleThreadScheduler.cpp | 11 +- 
.../CPPBoxWithNonMaximaSuppressionLimit.cpp | 157 +-- .../CPP/functions/CPPDetectionOutputLayer.cpp | 312 +++--- .../CPP/functions/CPPDetectionPostProcessLayer.cpp | 414 +++++--- .../CPP/functions/CPPNonMaximumSuppression.cpp | 21 +- src/runtime/CPP/functions/CPPTopKV.cpp | 5 +- src/runtime/IScheduler.cpp | 77 +- src/runtime/ISimpleLifetimeManager.cpp | 27 +- src/runtime/IWeightsManager.cpp | 45 +- src/runtime/Memory.cpp | 9 +- src/runtime/MemoryManagerOnDemand.cpp | 5 +- src/runtime/NEON/INEOperator.cpp | 7 +- src/runtime/NEON/INESimpleFunction.cpp | 4 +- src/runtime/NEON/INESimpleFunctionNoBorder.cpp | 5 +- src/runtime/NEON/functions/NEActivationLayer.cpp | 17 +- src/runtime/NEON/functions/NEAddMulAdd.cpp | 44 +- src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 15 +- .../NEON/functions/NEArithmeticAddition.cpp | 26 +- .../NEON/functions/NEArithmeticSubtraction.cpp | 26 +- .../NEON/functions/NEBatchNormalizationLayer.cpp | 25 +- src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp | 13 +- src/runtime/NEON/functions/NEBitwiseAnd.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseNot.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseOr.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseXor.cpp | 3 +- .../NEON/functions/NEBoundingBoxTransform.cpp | 11 +- src/runtime/NEON/functions/NECast.cpp | 14 +- .../NEON/functions/NEChannelShuffleLayer.cpp | 1 + src/runtime/NEON/functions/NEConcatenateLayer.cpp | 30 +- src/runtime/NEON/functions/NEConv3D.cpp | 27 +- .../functions/NEConvertFullyConnectedWeights.cpp | 24 +- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 84 +- src/runtime/NEON/functions/NECopy.cpp | 12 +- src/runtime/NEON/functions/NECropResize.cpp | 54 +- .../NEON/functions/NEDeconvolutionLayer.cpp | 116 ++- src/runtime/NEON/functions/NEDepthConvertLayer.cpp | 17 +- src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp | 1 + .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 193 ++-- .../NEON/functions/NEDequantizationLayer.cpp | 10 +- .../NEON/functions/NEDetectionPostProcessLayer.cpp | 60 +- .../NEON/functions/NEDirectConvolutionLayer.cpp | 27 +- .../NEON/functions/NEElementwiseOperations.cpp | 152 +-- .../NEON/functions/NEElementwiseUnaryLayer.cpp | 15 +- src/runtime/NEON/functions/NEFFT1D.cpp | 29 +- src/runtime/NEON/functions/NEFFT2D.cpp | 8 +- .../NEON/functions/NEFFTConvolutionLayer.cpp | 105 +- src/runtime/NEON/functions/NEFill.cpp | 10 +- src/runtime/NEON/functions/NEFillBorder.cpp | 9 +- src/runtime/NEON/functions/NEFlattenLayer.cpp | 22 +- src/runtime/NEON/functions/NEFloor.cpp | 12 +- .../NEON/functions/NEFullyConnectedLayer.cpp | 77 +- .../NEON/functions/NEFuseBatchNormalization.cpp | 42 +- src/runtime/NEON/functions/NEGEMM.cpp | 62 +- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 39 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 76 +- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 61 +- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 33 +- src/runtime/NEON/functions/NEGather.cpp | 3 +- .../NEON/functions/NEGenerateProposalsLayer.cpp | 187 ++-- .../functions/NEInstanceNormalizationLayer.cpp | 26 +- src/runtime/NEON/functions/NEL2NormalizeLayer.cpp | 4 +- src/runtime/NEON/functions/NELSTMLayer.cpp | 510 +++++---- .../NEON/functions/NELSTMLayerQuantized.cpp | 383 +++++-- src/runtime/NEON/functions/NELogical.cpp | 12 +- src/runtime/NEON/functions/NEMatMul.cpp | 28 +- src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp | 24 +- .../functions/NEMeanStdDevNormalizationLayer.cpp | 3 +- .../NEON/functions/NENormalizationLayer.cpp | 10 +- src/runtime/NEON/functions/NEPReluLayer.cpp | 14 +- 
src/runtime/NEON/functions/NEPadLayer.cpp | 90 +- src/runtime/NEON/functions/NEPermute.cpp | 10 +- .../NEON/functions/NEPixelWiseMultiplication.cpp | 50 +- src/runtime/NEON/functions/NEPooling3dLayer.cpp | 17 +- src/runtime/NEON/functions/NEPoolingLayer.cpp | 21 +- src/runtime/NEON/functions/NEPriorBoxLayer.cpp | 13 +- src/runtime/NEON/functions/NEQLSTMLayer.cpp | 1082 ++++++++++++-------- src/runtime/NEON/functions/NEQuantizationLayer.cpp | 10 +- src/runtime/NEON/functions/NERNNLayer.cpp | 44 +- src/runtime/NEON/functions/NEROIAlignLayer.cpp | 10 +- src/runtime/NEON/functions/NEROIPoolingLayer.cpp | 17 +- src/runtime/NEON/functions/NERange.cpp | 6 +- src/runtime/NEON/functions/NEReduceMean.cpp | 55 +- .../NEON/functions/NEReductionOperation.cpp | 75 +- src/runtime/NEON/functions/NEReorderLayer.cpp | 19 +- src/runtime/NEON/functions/NEReorgLayer.cpp | 3 +- src/runtime/NEON/functions/NEReshapeLayer.cpp | 12 +- src/runtime/NEON/functions/NEReverse.cpp | 8 +- src/runtime/NEON/functions/NEScale.cpp | 48 +- src/runtime/NEON/functions/NESelect.cpp | 1 + src/runtime/NEON/functions/NESlice.cpp | 35 +- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 21 +- src/runtime/NEON/functions/NESpaceToBatchLayer.cpp | 38 +- src/runtime/NEON/functions/NESpaceToDepthLayer.cpp | 4 +- src/runtime/NEON/functions/NESplit.cpp | 2 +- src/runtime/NEON/functions/NEStackLayer.cpp | 11 +- src/runtime/NEON/functions/NEStridedSlice.cpp | 59 +- src/runtime/NEON/functions/NETile.cpp | 3 +- src/runtime/NEON/functions/NETranspose.cpp | 10 +- src/runtime/NEON/functions/NEUnstack.cpp | 34 +- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 44 +- src/runtime/OMP/OMPScheduler.cpp | 14 +- src/runtime/OffsetLifetimeManager.cpp | 20 +- src/runtime/OffsetMemoryPool.cpp | 8 +- src/runtime/OperatorTensor.cpp | 3 +- src/runtime/PoolManager.cpp | 11 +- src/runtime/RuntimeContext.cpp | 3 +- src/runtime/Scheduler.cpp | 13 +- src/runtime/SchedulerFactory.cpp | 2 +- src/runtime/SchedulerUtils.cpp | 19 +- src/runtime/SubTensor.cpp | 3 +- src/runtime/Tensor.cpp | 3 +- src/runtime/TensorAllocator.cpp | 19 +- src/runtime/Utils.cpp | 15 +- .../ClDirectConvDefaultConfigBifrost.cpp | 67 +- .../direct_conv/ClDirectConvDefaultConfigBifrost.h | 20 +- .../ClDirectConvDefaultConfigValhall.cpp | 125 ++- .../direct_conv/ClDirectConvDefaultConfigValhall.h | 20 +- .../direct_conv/ClDirectConvKernelConfig.h | 4 +- .../direct_conv/IClDirectConvKernelConfig.h | 14 +- .../dwc_native/ClDWCNativeDefaultConfigBifrost.cpp | 133 ++- .../dwc_native/ClDWCNativeDefaultConfigBifrost.h | 42 +- .../dwc_native/ClDWCNativeDefaultConfigValhall.cpp | 127 ++- .../dwc_native/ClDWCNativeDefaultConfigValhall.h | 35 +- .../dwc_native/ClDWCNativeHeuristicsHelpers.cpp | 8 +- .../dwc_native/ClDWCNativeKernelConfig.h | 2 +- .../dwc_native/IClDWCNativeKernelConfig.h | 16 +- .../ClIndirectConvDefaultConfigValhall.cpp | 60 +- .../ClIndirectConvDefaultConfigValhall.h | 9 +- .../indirect_conv/ClIndirectConvKernelConfig.h | 2 +- .../indirect_conv/IClIndirectConvKernelConfig.h | 12 +- .../ClMatMulNativeDefaultConfigValhall.cpp | 396 +++---- .../ClMatMulNativeDefaultConfigValhall.h | 11 +- .../matmul_native/ClMatMulNativeHelpers.cpp | 42 +- .../matmul_native/ClMatMulNativeHelpers.h | 15 +- .../matmul_native/ClMatMulNativeKernelConfig.h | 4 +- .../matmul_native/IClMatMulNativeKernelConfig.h | 9 +- 257 files changed, 8597 insertions(+), 5501 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp index ef7c62d64b..eca712dbf0 100644 --- 
a/src/runtime/Allocator.cpp +++ b/src/runtime/Allocator.cpp @@ -22,9 +22,9 @@ * SOFTWARE. */ #include "arm_compute/runtime/Allocator.h" -#include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/core/Error.h" +#include "arm_compute/runtime/MemoryRegion.h" #include diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp index bea55d8eb9..8a0fc05c39 100644 --- a/src/runtime/BlobLifetimeManager.cpp +++ b/src/runtime/BlobLifetimeManager.cpp @@ -35,8 +35,7 @@ namespace arm_compute { -BlobLifetimeManager::BlobLifetimeManager() - : _blobs() +BlobLifetimeManager::BlobLifetimeManager() : _blobs() { } @@ -62,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings() ARM_COMPUTE_ERROR_ON(_active_group == nullptr); // Sort free blobs requirements in descending order. - _free_blobs.sort([](const Blob & ba, const Blob & bb) - { - return ba.max_size > bb.max_size; - }); + _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; }); // Create group sizes vector std::vector group_sizes; - std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b) - { - return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() }; - }); + std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), + [](const Blob &b) { + return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()}; + }); // Update blob sizes size_t max_size = std::max(_blobs.size(), group_sizes.size()); _blobs.resize(max_size); group_sizes.resize(max_size); - std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs) - { - return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) }; - }); + std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), + [](BlobInfo lhs, BlobInfo rhs) + { + return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), + std::max(lhs.owners, rhs.owners)}; + }); // Calculate group mappings auto &group_mappings = _active_group->mappings(); int blob_idx = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp index 88e280537c..a2f63ef52b 100644 --- a/src/runtime/BlobMemoryPool.cpp +++ b/src/runtime/BlobMemoryPool.cpp @@ -47,7 +47,7 @@ BlobMemoryPool::~BlobMemoryPool() void BlobMemoryPool::acquire(MemoryMappings &handles) { // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(_blobs[handle.second].get()); @@ -56,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles) void BlobMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); @@ -78,7 +78,7 @@ void BlobMemoryPool::allocate_blobs(const std::vector &blob_info) { ARM_COMPUTE_ERROR_ON(!_allocator); - for(const auto &bi : blob_info) + for (const auto &bi : blob_info) { 
_blobs.push_back(_allocator->make_region(bi.size, bi.alignment)); } diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp index e06ef3d37d..b4545b93bf 100644 --- a/src/runtime/CL/CLBufferAllocator.cpp +++ b/src/runtime/CL/CLBufferAllocator.cpp @@ -35,7 +35,8 @@ namespace arm_compute void *CLBufferAllocator::allocate(size_t size, size_t alignment) { ARM_COMPUTE_UNUSED(alignment); - cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) }; + cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, + nullptr, nullptr)}; return static_cast(buf); } diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp index 7168259fcd..d680dc08bb 100644 --- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp +++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp @@ -27,8 +27,7 @@ namespace arm_compute { -CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() - : _heuristics(std::make_unique()) +CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique()) { } CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default; diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp index 5b4bbbcde0..eb28ecbf8d 100644 --- a/src/runtime/CL/CLHelpers.cpp +++ b/src/runtime/CL/CLHelpers.cpp @@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void * @return A pointer to the context properties which can be used to create an opencl context */ -void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array &prop) +void initialise_context_properties(const cl::Platform &platform, + const cl::Device &device, + std::array &prop) { ARM_COMPUTE_UNUSED(device); #if defined(ARM_COMPUTE_ASSERTS_ENABLED) // Query devices in the context for cl_arm_printf support - if(arm_compute::device_supports_extension(device, "cl_arm_printf")) + if (arm_compute::device_supports_extension(device, "cl_arm_printf")) { // Create a cl_context with a printf_callback and user specified buffer size. - std::array properties_printf = - { + std::array properties_printf = { CL_CONTEXT_PLATFORM, reinterpret_cast(platform()), // Enable a printf callback function for this context. CL_PRINTF_CALLBACK_ARM, reinterpret_cast(printf_callback), // Request a minimum printf buffer size of 4MB for devices in the // context that support this extension. - CL_PRINTF_BUFFERSIZE_ARM, 0x1000, - 0 - }; + CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0}; prop = properties_printf; } else #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED) { - std::array properties = - { - CL_CONTEXT_PLATFORM, reinterpret_cast(platform()), - 0 - }; + std::array properties = {CL_CONTEXT_PLATFORM, + reinterpret_cast(platform()), 0}; std::copy(properties.begin(), properties.end(), prop.begin()); }; } @@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) cl::Platform::get(&platforms); ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform"); - cl::Platform selected_platform{ nullptr }; + cl::Platform selected_platform{nullptr}; // If the user has selected the Native platform, return the first available. 
- switch(cl_backend_type) + switch (cl_backend_type) { case CLBackendType::Native: selected_platform = platforms[0]; break; case CLBackendType::Clvk: - for(auto p : platforms) + for (auto p : platforms) { std::string res = p.getInfo(); - if(res.find("clvk") != std::string::npos) + if (res.find("clvk") != std::string::npos) { selected_platform = p; break; @@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) ARM_COMPUTE_ERROR("Unsupported backend type"); } - if(!selected_platform()) + if (!selected_platform()) { ARM_COMPUTE_ERROR("No valid platform found"); } @@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) return selected_platform; } -std::tuple -create_opencl_context_and_device(CLBackendType cl_backend_type) +std::tuple create_opencl_context_and_device(CLBackendType cl_backend_type) { ARM_COMPUTE_ERROR_ON(!opencl_is_available()); cl::Platform p = select_preferable_platform(cl_backend_type); @@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) std::vector platform_devices; p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices); ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device"); - device = platform_devices[0]; - cl_int err = CL_SUCCESS; - std::array properties = { 0, 0, 0, 0, 0, 0, 0 }; + device = platform_devices[0]; + cl_int err = CL_SUCCESS; + std::array properties = {0, 0, 0, 0, 0, 0, 0}; initialise_context_properties(p, device, properties); cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context"); @@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush) { ARM_COMPUTE_ERROR_ON_NULLPTR(kernel); - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr); ctx->gpu_scheduler()->enqueue(*kernel, flush); diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp index a1743c56e6..c6ee6fde83 100644 --- a/src/runtime/CL/CLMemory.cpp +++ b/src/runtime/CL/CLMemory.cpp @@ -24,24 +24,22 @@ #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/core/Error.h" + #include "support/Cast.h" namespace arm_compute { -CLMemory::CLMemory() - : _region(nullptr), _region_owned(nullptr) +CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr) { } -CLMemory::CLMemory(const std::shared_ptr &memory) - : _region(nullptr), _region_owned(memory) +CLMemory::CLMemory(const std::shared_ptr &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -CLMemory::CLMemory(ICLMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } @@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr region) _region_owned = utils::cast::polymorphic_downcast_unique_ptr(std::move(region)); _region = _region_owned.get(); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 00f91a0ffb..835958b816 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -29,10 +29,7 @@ namespace arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) - : IMemoryRegion(size), - _ctx(CLScheduler::get().context()), - 
_mapping(nullptr), - _mem() + : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem() { } @@ -57,17 +54,15 @@ std::unique_ptr ICLMemoryRegion::extract_subregion(size_t offset, return nullptr; } -CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) - : ICLMemoryRegion(size) +CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size) { - if(_size != 0) + if (_size != 0) { _mem = cl::Buffer(CLScheduler::get().context(), flags, _size); } } -CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) - : ICLMemoryRegion(buffer.getInfo()) +CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo()) { _mem = buffer; } @@ -102,10 +97,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q) ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment) : ICLMemoryRegion(size), _ptr(nullptr) { - if(size != 0) + if (size != 0) { _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment); - if(_ptr != nullptr) + if (_ptr != nullptr) { _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr); } @@ -114,7 +109,7 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a ICLSVMMemoryRegion::~ICLSVMMemoryRegion() { - if(_ptr != nullptr) + if (_ptr != nullptr) { try { @@ -125,7 +120,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion() _mem = cl::Buffer(); clSVMFree(_ctx.get(), _ptr); } - catch(...) + catch (...) { } } @@ -144,7 +139,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(_ptr == nullptr); - clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr); + clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, + nullptr); _mapping = _ptr; return _mapping; } @@ -163,7 +159,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { - if(blocking) + if (blocking) { clFinish(q.get()); } diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp index 075a544077..89d4520038 100644 --- a/src/runtime/CL/CLOperator.cpp +++ b/src/runtime/CL/CLOperator.cpp @@ -30,14 +30,13 @@ namespace arm_compute { namespace experimental { -ICLOperator::ICLOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void ICLOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp index 5083b4b0c5..b426b8c304 100644 --- a/src/runtime/CL/CLRuntimeContext.cpp +++ b/src/runtime/CL/CLRuntimeContext.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -29,7 +30,10 @@ namespace arm_compute { CLRuntimeContext::CLRuntimeContext() - : _gpu_owned_scheduler(std::make_unique()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type() + : _gpu_owned_scheduler(std::make_unique()), + _gpu_scheduler(_gpu_owned_scheduler.get()), + _symbols(), + _backend_type() { _symbols.load_default(); auto ctx_dev_err = create_opencl_context_and_device(_backend_type); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index b7a4dff45d..f0a42f55fd 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event() void CLScheduler::tune_kernel_static(ICLKernel &kernel) { - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { _cl_tuner->tune_kernel_static(kernel); } @@ -95,8 +96,16 @@ bool CLScheduler::is_initialised() const std::once_flag CLScheduler::_initialize_symbols; CLScheduler::CLScheduler() - : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(true), - _job_chaining_size(1), _job_chaining_count(0) + : _context(), + _queue(), + _target(GPUTarget::MIDGARD), + _is_initialised(false), + _cl_tuner(nullptr), + _gemm_heuristics(nullptr), + _backend_type(CLBackendType::Native), + _job_chaining_enabled(true), + _job_chaining_size(1), + _job_chaining_count(0) { } @@ -107,9 +116,12 @@ CLScheduler &CLScheduler::get() return scheduler; } -void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h) +void CLScheduler::default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h) { - if(!_is_initialised) + if (!_is_initialised) { const std::string cl_kernels_folder("./cl_kernels/"); cl::CommandQueue queue = cl::CommandQueue(ctx, device); @@ -121,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) { - if(!_is_initialised) + if (!_is_initialised) { cl::Context ctx; cl::Device dev; @@ -151,7 +163,12 @@ void CLScheduler::set_context(cl::Context context) CLKernelLibrary::get().set_context(_context); } -void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +void CLScheduler::init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h, + CLBackendType cl_backend_type) { set_context(std::move(context)); _queue = std::move(queue); @@ -164,21 +181,21 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush) { - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! 
Please call the CLScheduler::get().default_init(), \ + ARM_COMPUTE_ERROR_ON_MSG( + !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); const bool inject_memory = !tensors.empty(); // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel); } // Run kernel inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); - if(_job_chaining_enabled) + if (_job_chaining_enabled) { ++_job_chaining_count; } @@ -188,9 +205,9 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f void CLScheduler::flush_queue(bool flush) { - if(_job_chaining_enabled) + if (_job_chaining_enabled) { - if(_job_chaining_count >= _job_chaining_size) + if (_job_chaining_count >= _job_chaining_size) { _job_chaining_count = 0; /* @@ -199,14 +216,14 @@ void CLScheduler::flush_queue(bool flush) the CPU activity for job-scheduling. For eg. job-chain size goes from 1, 2, 4, 8 and 16 */ - if(_job_chaining_size < 16) + if (_job_chaining_size < 16) { _job_chaining_size <<= 1; } _queue.flush(); } } - else if(flush) + else if (flush) { _queue.flush(); } diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp index 14936ae23c..ace820bbb7 100644 --- a/src/runtime/CL/CLSubTensor.cpp +++ b/src/runtime/CL/CLSubTensor.cpp @@ -29,12 +29,14 @@ using namespace arm_compute; -CLSubTensor::CLSubTensor() - : _parent(nullptr), _info() +CLSubTensor::CLSubTensor() : _parent(nullptr), _info() { } -CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent) +CLSubTensor::CLSubTensor(ICLTensor *parent, + const TensorShape &tensor_shape, + const Coordinates &coords, + bool extend_parent) : _parent(nullptr), _info() { ARM_COMPUTE_ERROR_ON(parent == nullptr); @@ -81,7 +83,7 @@ void CLSubTensor::unmap() uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); - if(_parent->buffer() == nullptr) + if (_parent->buffer() == nullptr) { _parent->map(q, blocking); } diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index f85b8ae777..e6457218c7 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr; std::unique_ptr allocate_region(size_t size, cl_uint alignment) { // Try fine-grain SVM - std::unique_ptr region = std::make_unique(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - alignment); + std::unique_ptr region = + std::make_unique(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment); // Try coarse-grain SVM in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique(CL_MEM_READ_WRITE, size, alignment); } // Try legacy buffer memory in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); } @@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset) * @param[in] qinfo Quantization info * @param[in] 
pad_size Pad size to use in case array needs to be padded for computation purposes */ -void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size) +void populate_quantization_info(CLFloatArray &scale, + CLInt32Array &offset, + const QuantizationInfo &qinfo, + size_t pad_size) { clear_quantization_arrays(scale, offset); @@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const const size_t element_size = sizeof(std::remove_reference::type::value_type); scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, + qinfo.scale().data()); - if(!qinfo.offset().empty()) + if (!qinfo.offset().empty()) { // Create offset array - const std::vector &qoffset = qinfo.offset(); - const size_t offset_element_size = sizeof(std::remove_reference::type::value_type); - offset = CLInt32Array(num_elements + pad_size); + const std::vector &qoffset = qinfo.offset(); + const size_t offset_element_size = sizeof(std::remove_reference::type::value_type); + offset = CLInt32Array(num_elements + pad_size); offset.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(static_global_cl_allocator != nullptr) + if (static_global_cl_allocator != nullptr) { _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } @@ -146,7 +150,7 @@ void CLTensorAllocator::allocate() } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator) uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 445638f01f..0d62fe3afe 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + #include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -37,19 +38,23 @@ namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } struct CLTuner::IKernelData { - virtual ~IKernelData() = default; + virtual ~IKernelData() = default; virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; }; struct DefaultKernelData : public CLTuner::IKernelData { - DefaultKernelData(ITensorPack &tensors) - : _tensors{ tensors } + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} { } ~DefaultKernelData() override = default; @@ -100,16 +105,17 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. 
If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { auto p = _tuning_params_table.find(config_id); - if(p == _tuning_params_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); @@ -119,7 +125,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); } @@ -129,7 +135,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(p->second.get_wbsm()); } @@ -138,7 +144,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) } void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) { - DefaultKernelData data{ tensors }; + DefaultKernelData data{tensors}; do_tune_kernel_dynamic(kernel, &data); } @@ -154,7 +160,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -165,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -176,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. 
return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -209,9 +217,10 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() /// Please see COMPMID-5934 cl::NDRange gws = kernel.get_cached_gws(); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", - kernel.config_id().c_str(), to_string(gws).c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -224,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Construct the list of tuning parameters values to be tested based on the tuner mode. auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); - for(size_t i = 0; i < tuning_list->size(); ++i) + for (size_t i = 0; i < tuning_list->size(); ++i) { CLTuningParams tuning_test = (*tuning_list)[i]; // Setting the lws @@ -234,19 +243,18 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } kernel.set_lws_hint(lws_test); - if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) { cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Trying LWS: %s, WBSM: %d", + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel @@ -260,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; opt_tuning_params.set_lws(tuning_test.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); } @@ -292,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; bool header_line = true; - while(!std::getline(fs, line).fail()) + while (!std::getline(fs, line).fail()) { - if(header_line) + if (header_line) { header_line = false; size_t pos_lws = line.find("lws"); size_t pos_wbsm = line.find("wbsm"); _tuning_info.tune_wbsm = false; - if(pos_lws != 
std::string::npos || pos_wbsm != std::string::npos) + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { // The file has in the first line the parameters it has been tuned on - if(pos_wbsm != std::string::npos) + if (pos_wbsm != std::string::npos) { _tuning_info.tune_wbsm = true; } // Once the line with the tuning parameter is read we can // read the next one to start collecting the values - if(std::getline(fs, line).fail()) + if (std::getline(fs, line).fail()) { break; } @@ -324,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename) CLTuningParams tuning_params; size_t pos = line.find(";"); - if(pos == std::string::npos) + if (pos == std::string::npos) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } std::string kernel_id = line.substr(0, pos); line.erase(0, pos + 1); - if(!tuning_params.from_string(_tuning_info, line)) + if (!tuning_params.from_string(_tuning_info, line)) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } @@ -341,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename) bool CLTuner::save_to_file(const std::string &filename) const { - if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) { return false; } @@ -350,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const fs.open(filename, std::ios::out); std::string header_string = ""; header_string += "lws"; - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { - if(!header_string.empty()) + if (!header_string.empty()) { header_string += " "; } header_string += "wbsm"; } fs << header_string << std::endl; - for(auto const &kernel_data : _tuning_params_table) + for (auto const &kernel_data : _tuning_params_table) { CLTuningParams tun_pams(kernel_data.second); fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index 4530537789..bc782c3a2c 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(std::make_unique()), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique()), _ctx(ctx) { } diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index da3d4850bf..294396c28a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -35,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast(&binary_len), sizeof(size_t)); - if(name_len == 0 || binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -60,7 +60,7 @@ void restore_program_cache_from_file(const 
std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries binaries{binary}; std::vector devices = context.getInfo(); cl::Program program(context, devices, binaries); program.build(); @@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector> binaries = it.second.getInfo(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index f324b1a68c..c035644e4a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClActivation.h" @@ -35,18 +36,17 @@ namespace arm_compute { struct CLActivationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : _impl(std::make_unique()) +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; +CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; CLActivationLayer::~CLActivationLayer() = default; @@ -55,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -66,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return opencl::ClActivation::validate(input, output, act_info); } diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index b30d739025..f9bbd31e8a 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -27,31 +27,39 @@ 
#include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); DataType output_data_type = DataType::S32; @@ -59,17 +67,18 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; @@ -85,20 +94,36 @@ void CLArgMinMaxLayer::configure(const 
ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _reduction_axis = axis; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - - TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() }; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; not_reshaped_output_shape.set(axis, 1); - auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _arg_min_max_kernel = std::make_unique(); _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index e8affc0853..0c371c4171 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -30,9 +30,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" namespace arm_compute { @@ -43,24 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer() CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, 
const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); } @@ -69,4 +84,4 @@ void CLBatchNormalizationLayer::run() { CLScheduler::get().enqueue(*_norm_kernel, true); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index d7a409128d..a3798daf61 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -30,14 +30,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" namespace arm_compute { -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel(std::make_unique()) +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique()) { } @@ -49,29 +47,43 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t 
block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index a4712ed3f1..7bfd0e3677 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,11 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 5964b92447..9763915c02 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -43,4 +42,4 @@ void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLT k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index a07bf17bb2..dd3171b982 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,11 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC configure(CLKernelLibrary::get().get_compile_context(), 
input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index f65e2e406c..5bee4b37ec 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include @@ -36,7 +35,10 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 48583bfaf3..76e626fd75 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,18 +23,24 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); @@ -44,7 +50,10 @@ void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git 
a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 10f7cc2065..42ec8f7ee0 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include @@ -37,16 +37,15 @@ namespace arm_compute { struct CLCast::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCast::CLCast() - : _impl(std::make_unique()) +CLCast::CLCast() : _impl(std::make_unique()) { } -CLCast::CLCast(CLCast &&) = default; +CLCast::CLCast(CLCast &&) = default; CLCast &CLCast::operator=(CLCast &&) = default; CLCast::~CLCast() = default; @@ -55,7 +54,10 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, policy); @@ -74,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void CLCast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 021f28f238..1ee4789816 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 192a266f0f..2f54371e88 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLComparisonKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include 
"src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { @@ -37,25 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparison::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_LOG_PARAMS(input2, input2, output, operation); auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, operation); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } -Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { return CLComparisonKernel::validate(input1, input2, output, operation); } @@ -67,25 +75,30 @@ void CLComparisonStatic::configure(ICLTensor *input1, ICLTensor *input2, IC } template -void CLComparisonStatic::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLComparisonStatic::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { auto k = std::make_unique(); k->configure(compile_context, input1, input2, output, COP); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } template -Status CLComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status +CLComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { return CLComparisonKernel::validate(input1, input2, output, COP); } diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 0a8884f4e3..9df1c34593 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -24,24 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConcatenate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConcatenate.h" namespace arm_compute { struct CLConcatenateLayer::Impl { std::vector srcs{}; - ICLTensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ICLTensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -CLConcatenateLayer::CLConcatenateLayer() - : _impl(std::make_unique()) +CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique()) { } @@ -56,7 +55,10 @@ void CLConcatenateLayer::configure(std::vector &inputs_vector configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); } -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); @@ -68,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -76,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return opencl::ClConcatenate::validate(inputs_vector, output, axis); } @@ -84,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector &inpu void CLConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp index 
729b973b6a..9d1b368f72 100644 --- a/src/runtime/CL/functions/CLConv3D.cpp +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLConv3D.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/operators/ClDirectConv3d.h" namespace arm_compute @@ -32,29 +33,38 @@ using namespace arm_compute::experimental; struct CLConv3D::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLConv3D::CLConv3D() - : _impl(std::make_unique()) +CLConv3D::CLConv3D() : _impl(std::make_unique()) { } CLConv3D::~CLConv3D() = default; -void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); } -void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); _impl->src = src; _impl->weights = weights; @@ -62,10 +72,15 @@ void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTenso _impl->dst = dst; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? 
_impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); } -Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); } diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index b3efe5c8a0..2298f2a669 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -27,33 +27,37 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" namespace arm_compute { struct CLConvertFullyConnectedWeights::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() - : _impl(std::make_unique()) +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique()) { } CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); @@ -63,8 +67,10 @@ void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_c _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index f3c05adb47..7767b45a01 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -28,11 +28,11 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConv2d.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -43,41 +43,59 @@ struct CLConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } CLConvolutionLayer::~CLConvolutionLayer() = default; -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, - weights_info, CLScheduler::get().target())) + switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, + weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: @@ -85,7 +103,8 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT case ConvolutionMethod::GEMM: { auto f = std::make_unique(); - f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + f->configure(compile_context, input->info(), weights->info(), + ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); _impl->op = std::move(f); break; } @@ -101,40 +120,52 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CLConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) + 
switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: case ConvolutionMethod::INDIRECT: case ConvolutionMethod::GEMM: { - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); break; } case ConvolutionMethod::FFT: { // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, + act_info, enable_fast_math)); break; } default: @@ -145,8 +176,15 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) +ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const ActivationLayerInfo &act_info, + const GPUTarget gpu_target, + const Size2D &dilation, + bool enable_fast_math) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); @@ -158,7 +196,7 @@ void CLConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -170,7 +208,7 @@ void CLConvolutionLayer::run() void CLConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 56400b67a0..a4f2b0634f 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCopy.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCopy.h" #include @@ -38,16 +38,15 @@ namespace arm_compute { struct CLCopy::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCopy::CLCopy() - : _impl(std::make_unique()) +CLCopy::CLCopy() : _impl(std::make_unique()) { } -CLCopy::CLCopy(CLCopy &&) = default; +CLCopy::CLCopy(CLCopy &&) = default; CLCopy &CLCopy::operator=(CLCopy &&) = default; CLCopy::~CLCopy() = default; diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp index 35ea17cfc2..fc29c43827 100644 --- a/src/runtime/CL/functions/CLCrop.cpp +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include 
"arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCrop.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCrop.h" #include @@ -38,27 +38,38 @@ namespace arm_compute { struct CLCrop::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLCrop::CLCrop() - : _impl(std::make_unique()) +CLCrop::CLCrop() : _impl(std::make_unique()) { } -CLCrop::CLCrop(CLCrop &&) = default; +CLCrop::CLCrop(CLCrop &&) = default; CLCrop &CLCrop::operator=(CLCrop &&) = default; CLCrop::~CLCrop() = default; -void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window); + configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, + dst_window); } -void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); @@ -67,10 +78,17 @@ void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor _impl->dst = dst; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); } -Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); } diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index d8fc38d99e..821412b149 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,19 +25,26 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/common/utils/Log.h" - #include namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, 
ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t &batch_index) { batch_index = *(reinterpret_cast(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -50,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, static_cast(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast(abs(end[0] - start[0])) + 1, + static_cast(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } CLCropResize::~CLCropResize() = default; -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -83,20 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen return Status{}; } -void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const ICLTensor *input, + ICLTensor *boxes, + 
ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { - configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value); + configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, + extrapolation_value); } -void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind); - ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); - TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); + TensorShape output_shape = + TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -122,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // kernels used for cropping and scaling. _boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -143,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = std::make_unique(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT }); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); @@ -159,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array rows_out_of_bounds{ 0 }; + std::array rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). 
*/ - std::array cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -203,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - - int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. - int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? 
start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; auto kernel = std::make_unique(); - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); auto kernel = std::make_unique(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } @@ -277,18 +357,18 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_functions.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { kernel->run(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 4421a18f2a..e988ab0ac4 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -25,16 +25,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/operators/ClTransposedConvolution.h" -#include "src/common/utils/Log.h" - #include #include #include @@ -44,11 +44,11 @@ using namespace arm_compute::misc::shape_calculator; struct CLDeconvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor 
*biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; @@ -58,24 +58,35 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr memor { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { auto op = std::make_unique(); - op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, + output->info(), deconv_info); _impl->src = input; _impl->weights = weights; @@ -105,22 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate transposed convolution operator - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); break; } case DeconvolutionMethod::UPSCALE_CONV2D: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -137,12 +154,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { return DeconvolutionMethod::UPSCALE_CONV2D; } @@ -154,11 +175,12 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second) { // We observe better performance for FP32 types only when ofm <= 16. // A better heuristic is required for selecting the method for FP16 data types. 
- if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) + if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) { return DeconvolutionMethod::DIRECT; } @@ -175,7 +197,7 @@ void CLDeconvolutionLayer::run() { prepare(); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { // Optimized Operator will be used ITensorPack pack; @@ -195,7 +217,7 @@ void CLDeconvolutionLayer::run() void CLDeconvolutionLayer::prepare() { - if(_impl->op == nullptr) + if (_impl->op == nullptr) { _function->prepare(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index 0b428f5b17..b92bf903a6 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(std::make_unique()), - _fill(), - _output(nullptr) + : _upsample(std::make_unique()), _fill(), _output(nullptr) { } CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -52,13 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); _upsample->configure(compile_context, input, _output, info); } diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index cac3f51013..6d2fea974e 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include @@ -37,16 +37,15 @@ namespace arm_compute { struct CLDepthConvertLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + 
const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDepthConvertLayer::CLDepthConvertLayer() - : _impl(std::make_unique()) +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique()) { } -CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; CLDepthConvertLayer::~CLDepthConvertLayer() = default; @@ -55,7 +54,11 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) { ARM_COMPUTE_UNUSED(shift); ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); @@ -70,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return opencl::ClCast::validate(input, output, policy); @@ -78,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void CLDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 98531e7cac..9477c7f81d 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include @@ -36,7 +35,10 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index dcb982fa56..873601bb11 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include 
"arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::misc; @@ -63,25 +63,33 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptrinfo(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output != nullptr ? output->info() : input->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); + ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, + output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); _is_quantized = is_data_type_quantized(input->info()->data_type()); @@ -96,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = output; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -119,10 +127,12 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont CLTensor *output_multipliers_to_use = nullptr; CLTensor *output_shifts_to_use = nullptr; - if(_is_quantized) + if (_is_quantized) { - const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); - const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; + const size_t idx_c = + get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = + (is_data_type_quantized_per_channel(weights->info()->data_type())) ? 
weights->info()->dimension(idx_c) : 1; _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); @@ -132,16 +142,18 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont } // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; _dwc_native_kernel->set_target(gpu_target); _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use); + dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, + output_shifts_to_use); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); @@ -151,22 +163,27 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont _permuted_output.allocator()->allocate(); } - if(_is_quantized) + if (_is_quantized) { _output_multipliers.allocator()->allocate(); _output_shifts.allocator()->allocate(); } } -Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); const bool in_place = input == output || output == nullptr; - if(in_place) + if (in_place) { output = input; } @@ -174,21 +191,23 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > + input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > + input->dimension(idx_h) + conv_info.pad_top() + 
conv_info.pad_bottom()); const GPUTarget gpu_target = CLScheduler::get().target(); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; const bool needs_permute = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized(input->data_type()); TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) + if (is_quantized) { - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); @@ -201,40 +220,57 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe } } - if(needs_permute) + if (needs_permute) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); + const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation}; + TensorShape permuted_output_shape = + shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_input = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_weights = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_output = output->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NHWC); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, + &output_multipliers_shifts_info, &output_multipliers_shifts_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); } else { // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, - &output_multipliers_shifts_info)); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input, weights, conv_info, dilation, depth_multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, + &output_multipliers_shifts_info)); } return Status{}; } @@ -245,12 +281,12 @@ void CLDepthwiseConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_needs_permute) + if (_needs_permute) { _permute_input_to_nhwc.run(); } CLScheduler::get().enqueue(*_dwc_native_kernel); - if(_needs_permute) + if (_needs_permute) { _permute_output_to_nchw.run(); } @@ -258,22 +294,21 @@ void CLDepthwiseConvolutionLayer::run() void CLDepthwiseConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_is_quantized) + if (_is_quantized) { _output_multipliers.map(); _output_shifts.map(); - quantization::compute_quantized_multipliers_and_shifts(_input->info(), - _original_weights->info(), - _output != nullptr ? _output->info() : _input->info(), - reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), - reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); + quantization::compute_quantized_multipliers_and_shifts( + _input->info(), _original_weights->info(), _output != nullptr ? 
_output->info() : _input->info(), + reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); _output_shifts.unmap(); } - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 64c6b5d91c..20162a03db 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -26,22 +26,21 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClDequantize.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClDequantize.h" namespace arm_compute { struct CLDequantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDequantizationLayer::CLDequantizationLayer() - : _impl(std::make_unique()) +CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique()) { } CLDequantizationLayer::~CLDequantizationLayer() = default; @@ -51,7 +50,9 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) configure(CLKernelLibrary::get().get_compile_context(), input, output); } -void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, output); _impl->src = input; diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 752e0e4a60..d6dae0d732 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -28,37 +28,46 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClActivation.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" namespace arm_compute { struct CLDirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLDirectConvolutionLayer::CLDirectConvolutionLayer() - : _impl(std::make_unique()) +CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique()) { } -CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; +CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default; -void CLDirectConvolutionLayer::configure(ICLTensor *input, const 
ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -69,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -87,4 +101,4 @@ void CLDirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 88c3c6193c..3717f30ae1 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - #include #include @@ -55,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptrdata_layout(); @@ -70,20 +75,22 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), 
weights->dimension(idx_h), info); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - if(input->data_type() != weights->data_type()) + if (input->data_type() != weights->data_type()) { - ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || + !is_data_type_quantized_asymmetric(input->data_type())); } - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -102,24 +109,39 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen unsigned int deconv_pad_y = 0; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } -void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); } -void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info); @@ -141,15 +163,19 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte 
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); + auto out_dims = + deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); _is_prepared = weights_info.retain_internal_weights(); @@ -158,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape unsigned int deconv_pad_x = 0; unsigned int deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0; unsigned int deconv_pad_right = pad_left > pad_right ? 
pad_left - pad_right : 0; @@ -179,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _scaled_output.allocator()->init(scale_out_info); // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, + deconv_pad_bottom, DimensionRoundingType::FLOOR); _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output @@ -191,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _flip_axis.allocator()->allocate(); _flip_axis.map(true); auto axis_data = reinterpret_cast(_flip_axis.buffer()); - if(weights->info()->data_layout() == DataLayout::NHWC) + if (weights->info()->data_layout() == DataLayout::NHWC) { axis_data[0] = 1; axis_data[1] = 2; @@ -216,7 +244,7 @@ void CLDirectDeconvolutionLayer::run() void CLDirectDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -229,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare() _conv_f.prepare(); // Free flipped weights - if(!_weights_flipped.is_used()) + if (!_weights_flipped.is_used()) { _weights_flipped.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 936b37fb31..d9529f0b7f 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -26,8 +26,8 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClAdd.h" #include "src/gpu/cl/operators/ClElementwiseOperations.h" #include "src/gpu/cl/operators/ClSub.h" @@ -36,26 +36,30 @@ namespace arm_compute { struct CLArithmeticAddition::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticAddition::CLArithmeticAddition() - : _impl(std::make_unique()) +CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique()) { } -CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; +CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default; CLArithmeticAddition::~CLArithmeticAddition() = default; -void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticAddition::configure( + ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticAddition::configure(const CLCompileContext &compile_context, + const 
ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClAdd::validate(input1, input2, output, policy, act_info); } @@ -82,26 +90,33 @@ void CLArithmeticAddition::run() struct CLArithmeticSubtraction::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLArithmeticSubtraction::CLArithmeticSubtraction() - : _impl(std::make_unique()) +CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique()) { } -CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; +CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction::~CLArithmeticSubtraction() = default; -void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticSubtraction::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClSub::validate(input1, input2, output, policy, act_info); } @@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run() struct CLArithmeticDivision::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; 
-CLArithmeticDivision::CLArithmeticDivision() - : _impl(std::make_unique()) +CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique()) { } -CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; +CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default; CLArithmeticDivision::~CLArithmeticDivision() = default; -void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLArithmeticDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info); } @@ -173,26 +201,32 @@ void CLArithmeticDivision::run() struct CLElementwiseMax::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseMax::CLElementwiseMax() - : _impl(std::make_unique()) +CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique()) { } -CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; +CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default; CLElementwiseMax::~CLElementwiseMax() = default; -void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), 
act_info); } -Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMax::validate(input1, input2, output, act_info); } @@ -218,26 +255,32 @@ void CLElementwiseMax::run() struct CLElementwiseMin::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseMin::CLElementwiseMin() - : _impl(std::make_unique()) +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique()) { } -CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; CLElementwiseMin::~CLElementwiseMin() = default; -void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } @@ -263,26 +309,32 @@ void CLElementwiseMin::run() struct CLElementwiseSquaredDiff::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _impl(std::make_unique()) +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique()) { } -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void 
CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); } @@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run() struct CLElementwisePower::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLElementwisePower::CLElementwisePower() - : _impl(std::make_unique()) +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique()) { } -CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp index 
9dcd2d1891..3043c26feb 100644 --- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClElementwiseUnary.h" @@ -32,17 +33,16 @@ namespace arm_compute { struct CLRsqrtLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLRsqrtLayer::CLRsqrtLayer() - : _impl(std::make_unique()) +CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique()) { } -CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; +CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default; CLRsqrtLayer::~CLRsqrtLayer() = default; @@ -74,17 +74,16 @@ void CLRsqrtLayer::run() struct CLExpLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLExpLayer::CLExpLayer() - : _impl(std::make_unique()) +CLExpLayer::CLExpLayer() : _impl(std::make_unique()) { } -CLExpLayer::CLExpLayer(CLExpLayer &&) = default; +CLExpLayer::CLExpLayer(CLExpLayer &&) = default; CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default; CLExpLayer::~CLExpLayer() = default; @@ -116,17 +115,16 @@ void CLExpLayer::run() struct CLNegLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLNegLayer::CLNegLayer() - : _impl(std::make_unique()) +CLNegLayer::CLNegLayer() : _impl(std::make_unique()) { } -CLNegLayer::CLNegLayer(CLNegLayer &&) = default; +CLNegLayer::CLNegLayer(CLNegLayer &&) = default; CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default; CLNegLayer::~CLNegLayer() = default; @@ -157,17 +155,16 @@ void CLNegLayer::run() struct CLSinLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLSinLayer::CLSinLayer() - : _impl(std::make_unique()) +CLSinLayer::CLSinLayer() : _impl(std::make_unique()) { } -CLSinLayer::CLSinLayer(CLSinLayer &&) = default; +CLSinLayer::CLSinLayer(CLSinLayer &&) = default; CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default; CLSinLayer::~CLSinLayer() = default; @@ -198,17 +195,16 @@ void CLSinLayer::run() struct CLAbsLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLAbsLayer::CLAbsLayer() - : _impl(std::make_unique()) +CLAbsLayer::CLAbsLayer() : _impl(std::make_unique()) { } -CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; +CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default; CLAbsLayer::~CLAbsLayer() = default; @@ -239,17 +235,16 @@ void CLAbsLayer::run() struct CLLogLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogLayer::CLLogLayer() - : 
_impl(std::make_unique()) +CLLogLayer::CLLogLayer() : _impl(std::make_unique()) { } -CLLogLayer::CLLogLayer(CLLogLayer &&) = default; +CLLogLayer::CLLogLayer(CLLogLayer &&) = default; CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default; CLLogLayer::~CLLogLayer() = default; @@ -280,17 +275,16 @@ void CLLogLayer::run() struct CLRoundLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLRoundLayer::CLRoundLayer() - : _impl(std::make_unique()) +CLRoundLayer::CLRoundLayer() : _impl(std::make_unique()) { } -CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; +CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default; CLRoundLayer::~CLRoundLayer() = default; diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index bd0966b65f..48e9ae824a 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr memory_manager) @@ -54,7 +54,10 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) +void CLFFT1D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT1DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config)); @@ -77,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, + digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); _fft_kernels.reserve(_num_ffts); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -93,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels.emplace_back(std::make_unique()); - _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, + ((i == (_num_ffts - 1)) && !is_c2r) ? 
output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -123,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = CLFFTRadixStageKernel::supported_radix(); @@ -132,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); @@ -151,13 +157,13 @@ void CLFFT1D::run() CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling - if(_run_scale) + if (_run_scale) { CLScheduler::get().enqueue(*_scale_kernel, true); } diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 94fc411355..3857046719 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,16 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT2D::CLFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -46,7 +49,10 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) +void CLFFT2D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT2DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); 
ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config)); @@ -88,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index d12e2de3bf..3894b10785 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -25,10 +25,12 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -38,8 +40,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -50,11 +50,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -104,17 +104,31 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr mem { } -void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); - ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), + biases != nullptr ? biases->info() : nullptr, + output->info(), conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); _original_weights = weights; @@ -124,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -146,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -167,7 +184,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -175,10 +192,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -202,7 +219,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -214,25 +232,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(compile_context, 
&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, + Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -243,7 +264,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -255,7 +276,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(compile_context, output, nullptr, act_info); } @@ -269,8 +290,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_axis.unmap(); } -Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math); @@ -287,24 +313,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x()); } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + 
(input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } @@ -320,7 +349,7 @@ void CLFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -336,17 +365,17 @@ void CLFFTConvolutionLayer::run() _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer()); _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -354,10 +383,10 @@ void CLFFTConvolutionLayer::run() void CLFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -366,7 +395,7 @@ void CLFFTConvolutionLayer::prepare() const ICLTensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index 6019a84aba..9bd96a975e 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFill.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFill::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFill::CLFill() - : _impl(std::make_unique()) +CLFill::CLFill() : _impl(std::make_unique()) { } -CLFill::CLFill(CLFill &&) = default; +CLFill::CLFill(CLFill &&) = default; CLFill &CLFill::operator=(CLFill &&) = default; CLFill::~CLFill() = default; @@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window); } -void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window) +void CLFill::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 32fc37552c..ba1b5372d3 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -26,8 +26,9 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct 
CLFlattenLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFlattenLayer::CLFlattenLayer() - : _impl(std::make_unique()) +CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique()) { } -CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; +CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default; CLFlattenLayer::~CLFlattenLayer() = default; @@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique(); _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); @@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return opencl::ClFlatten::validate(input, output); @@ -83,4 +85,4 @@ void CLFlattenLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 8739e1803e..4322219dd9 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFloor.h" @@ -34,16 +35,15 @@ namespace arm_compute { struct CLFloor::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLFloor::CLFloor() - : _impl(std::make_unique()) +CLFloor::CLFloor() : _impl(std::make_unique()) { } -CLFloor::CLFloor(CLFloor &&) = default; +CLFloor::CLFloor(CLFloor &&) = default; CLFloor &CLFloor::operator=(CLFloor &&) = default; CLFloor::~CLFloor() = default; diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 1c162db79a..b30f9e701f 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClFullyConnected.h" @@ -35,21 +36,22 @@ using namespace arm_compute::experimental; 
struct CLFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); @@ -58,39 +60,45 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr mem CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; -void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = fc_info.retain_internal_weights; - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { @@ -98,14 +106,14 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c _impl->run_pack.add_tensor(ACL_DST, output); } - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, FullyConnectedLayerInfo fc_info) { return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); @@ -113,7 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn void CLFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -124,7 +132,7 @@ void CLFullyConnectedLayer::run() void CLFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -133,13 +141,13 @@ void CLFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) { _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 7379e9d9fe..e4fbf78e13 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" 
namespace arm_compute { @@ -41,29 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization() CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; -void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); - _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, + bn_beta, bn_gamma, epsilon, fbn_type); } -Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void CLFuseBatchNormalization::run() diff --git 
a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 427ea51ab9..871a1d6e27 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemm.h" @@ -40,15 +41,15 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) @@ -60,12 +61,25 @@ CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager * CLGEMM::~CLGEMM() = default; -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -73,25 +87,33 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor _impl->op = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + alpha, beta, gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } @@ -107,15 +129,15 @@ void CLGEMM::run() void CLGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index c8c18f35db..aef7cddd7a 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -27,10 +27,11 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "support/Cast.h" @@ -47,18 +48,19 @@ using namespace arm_compute::experimental; struct CLGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); @@ -67,40 +69,60 @@ 
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr m CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; -void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); _impl->weights = weights; _impl->op = std::make_unique(); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); @@ -115,14 +137,14 @@ void CLGEMMConvolutionLayer::run() void CLGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 9fc81c11da..7d40cf1829 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include @@ -40,12 +40,13 @@ namespace arm_compute { namespace { -std::pair compute_start_end_slice_coordinates(const 
ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ -63,13 +64,16 @@ std::pair compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -78,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -122,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptrdata_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -144,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = 
TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -166,77 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, 
&col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? 
bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -245,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. - if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -257,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -269,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. 
QuantizationInfo iq_info = input->info()->quantization_info(); QuantizationInfo wq_info = weights->info()->quantization_info(); - input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); - _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); + input_to_use->info()->set_quantization_info( + QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); + _reshaped_weights_t.info()->set_quantization_info( + QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); @@ -286,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } else { - _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, + gemm_info); } - if(_is_nchw) + if (_is_nchw) { _permuted_input.allocator()->allocate(); } @@ -298,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context ICLTensor *slice_output = nullptr; ICLTensor *output_stage_output = nullptr; - if(_padded_input && _is_quantized) + if (_padded_input && _is_quantized) { _memory_group.manage(&_slice_gemm_input); _memory_group.manage(&_gemmlowp_final); @@ -306,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context output_stage_output = &_slice_gemm_input; slice_output = output; } - else if(_padded_input) + else if (_padded_input) { _memory_group.manage(&_slice_gemm_input); deconv_reshape_output = &_slice_gemm_input; slice_output = output; } - else if(_is_quantized) + else if (_is_quantized) { _memory_group.manage(&_gemmlowp_final); deconv_reshape_output = &_gemmlowp_final; @@ -324,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), + weights->info(), deconv_info); _gemm_output.allocator()->allocate(); - if(_is_quantized) + if (_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, + output_stage_info); _gemmlowp_final.allocator()->allocate(); } // If the input was padded, the output needs to be sliced. 
- if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -350,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -366,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run() CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -379,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -392,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -401,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -411,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index d9029478a1..8bad198658 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -31,12 +31,12 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/IMemoryManager.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" namespace arm_compute @@ -46,13 +46,13 @@ using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; struct CLGEMMLowpMatrixMultiplyCore::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) @@ -63,12 +63,18 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrop = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, gemm_info); } @@ -108,7 +120,7 @@ void CLGEMMLowpMatrixMultiplyCore::run() void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 6feed0d713..3dd8c5f101 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -40,27 +40,33 @@ namespace arm_compute { struct CLGEMMLowpOutputStage::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *bias{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() - : _impl(std::make_unique()) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique()) { } -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -69,11 +75,15 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), info); - _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } }; + _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index 033c117cec..2610cb1a3b 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/kernels/CLGatherKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute { @@ -35,7 +35,11 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 9cb7d618cf..b2c1d2631e 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr memory_manager) @@ -71,48 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = 
scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -126,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -140,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { 
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -163,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -183,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -195,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -216,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -227,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + 
anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo 
proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -309,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -356,7 +412,7 @@ void CLGenerateProposalsLayer::run() CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -364,7 +420,7 @@ void 
CLGenerateProposalsLayer::run() _flatten_deltas.run(); _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors->run(); _dequantize_deltas->run(); @@ -373,7 +429,7 @@ void CLGenerateProposalsLayer::run() // Build the boxes CLScheduler::get().enqueue(*_bounding_box_kernel, false); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals->run(); } diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp index 90af36aa77..1a2369c5c2 100644 --- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp @@ -26,36 +26,45 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" namespace arm_compute { struct CLIndirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() - : _impl(std::make_unique()) +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique()) { } -CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default; CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; -void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -65,10 +74,15 @@ void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->biases = biases; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -83,4 +97,4 @@ void CLIndirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 5feafe19db..0e994e1aee 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -27,50 +27,62 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT - : _inst_norm_kernel(), - _mean_var_kernel(), - _mean_var_tensor(), - _ctx(ctx) + : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) { } CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); auto w = std::make_unique(); w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); _mean_var_kernel = std::move(w); auto k = std::make_unique(); - k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); _inst_norm_kernel = std::move(k); _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - return 
CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } void CLInstanceNormalizationLayer::run() { - ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured"); + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 1278385f53..4fe1d9b20b 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -57,7 +57,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); @@ -86,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index ea08beca75..3b50234c77 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/gpu/cl/kernels/ClTransposeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute { @@ -40,54 +40,155 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), 
_fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), - _transpose_cell_state(std::make_unique()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), - _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), - _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), - _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), - _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), - _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), - _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), - _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), - _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), - _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + 
_input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), + _is_layer_norm_lstm(false) { } CLLSTMLayer::~CLLSTMLayer() = default; -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor 
*cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, - output, lstm_params, activation_info, cell_threshold, projection_threshold); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -96,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + 
forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -126,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -154,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + 
lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -178,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -195,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); @@ -203,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -221,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -249,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, + ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 
0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -298,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); @@ -306,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -329,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, + lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be 
reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -361,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -383,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -397,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe output_gate_out->allocator()->allocate(); } -Status CLLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo 
*cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status CLLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -438,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted 
- if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -459,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -470,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -488,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type()); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? 
nullptr : forget_gate_bias, &forget_gate)); std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -525,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape 
lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + 
TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, 
lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -616,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -638,12 +829,12 @@ void CLLSTMLayer::run() _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -651,7 +842,7 @@ void CLLSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { _ones_fill.run(); _subtract_input_gate.run(); @@ -660,13 +851,13 @@ void CLLSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -679,12 +870,10 @@ void CLLSTMLayer::run() ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); - CLScheduler::get().enqueue_op(*_transpose_cell_state, - pack, - false); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -695,19 +884,19 @@ void CLLSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -718,10 +907,10 @@ void CLLSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -735,10 +924,10 @@ void CLLSTMLayer::run() void CLLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index d14c6102d5..ea64eda023 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -25,12 +25,12 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" 
-#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "arm_compute/core/Validate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include @@ -46,48 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state_tmp1(), + _cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + 
_is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + 
ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -95,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; 
_input_to_forget_weights = input_to_forget_weights; @@ -124,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -144,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -159,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -169,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -191,85 +280,111 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - 
_slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + 
_input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + 
ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -278,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, 
input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); @@ -299,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -343,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -353,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); 
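For context on the slice coordinates used in configure above and checked again in validate below: concatenating the weights lets a single gemmlowp produce all four gate pre-activations per batch row, laid out as [input | forget | input modulation | output], each block output_size wide. Those blocks then feed the usual cell update that the later multiplications and addition implement (cell_out = sigmoid(forget) * cell_in + sigmoid(input) * tanh(modulation), then output_state = sigmoid(output) * tanh(cell_out)). The snippet below is a plain-C++ sketch of that layout only, not the library's slicing API.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// One batch row of the fused GEMM output holds the four gate pre-activations
// back to back, each 'output_size' accumulators wide.
struct GateSlices
{
    std::vector<std::int32_t> input, forget, input_modulation, output;
};

static GateSlices slice_gates(const std::vector<std::int32_t> &fused_row, std::size_t output_size)
{
    GateSlices g;
    g.input.assign(fused_row.begin() + 0 * output_size, fused_row.begin() + 1 * output_size);
    g.forget.assign(fused_row.begin() + 1 * output_size, fused_row.begin() + 2 * output_size);
    g.input_modulation.assign(fused_row.begin() + 2 * output_size, fused_row.begin() + 3 * output_size);
    g.output.assign(fused_row.begin() + 3 * output_size, fused_row.begin() + 4 * output_size);
    return g;
}

int main()
{
    const std::size_t output_size = 3; // illustrative size only
    std::vector<std::int32_t> fused_row(4 * output_size);
    for (std::size_t i = 0; i < fused_row.size(); ++i)
        fused_row[i] = static_cast<std::int32_t>(i);

    const GateSlices g = slice_gates(fused_row, output_size);
    std::printf("forget gate starts at accumulator %d\n", g.forget.front()); // prints 3
    return 0;
}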
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -379,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -390,7 +542,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info{}; @@ -405,68 +558,91 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { 
output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const 
TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -475,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -541,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp index 696191c485..ea21c54bc3 100644 --- a/src/runtime/CL/functions/CLLogicalAnd.cpp +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
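The validate path above forms the requantization factor as multiplier = 4096.f * input_scale * weights_scale, i.e. (input_scale * weights_scale) / 2^-12, and hands it to quantization::calculate_quantized_multiplier to obtain an integer multiplier plus a shift for the output stage. The exact rounding rules of that helper are not shown in this patch; the sketch below only demonstrates the common frexp-based decomposition such a helper performs, with example scales chosen purely for illustration.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose 'multiplier' into multiplier ~= q31 * 2^-31 * 2^-shift, with q31 in [2^30, 2^31).
// For this layer the multiplier is expected to be below 1, so 'shift' comes out positive
// (a right shift). Illustrative only; the library helper may handle edge cases differently.
static void decompose_multiplier(double multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    int    exponent = 0;
    double mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
    auto   q        = static_cast<std::int64_t>(std::round(mantissa * (1ll << 31)));
    if (q == (1ll << 31)) // rounding pushed the mantissa to exactly 1.0
    {
        q /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<std::int32_t>(q);
    *right_shift          = -exponent;
}

int main()
{
    // Example scales, picked only to exercise the function.
    const double input_scale   = 0.0078125;
    const double weights_scale = 0.0039062;
    const double multiplier    = 4096.0 * input_scale * weights_scale; // (in_scale * w_scale) / 2^-12

    std::int32_t q31   = 0;
    int          shift = 0;
    decompose_multiplier(multiplier, &q31, &shift);
    std::printf("multiplier=%g -> q31=%d, right shift=%d\n", multiplier, q31, shift);
    return 0;
}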
*/ #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); @@ -54,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors) struct CLLogicalAnd::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalAnd::CLLogicalAnd() - : _impl(std::make_unique()) +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique()) { } -CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; CLLogicalAnd::~CLLogicalAnd() = default; @@ -73,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp index a0504d7852..71f9cce54f 100644 --- a/src/runtime/CL/functions/CLLogicalNot.cpp +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClLogicalNot.h" @@ -32,16 +33,15 @@ namespace arm_compute { struct CLLogicalNot::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalNot::CLLogicalNot() - : _impl(std::make_unique()) +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique()) { } -CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; CLLogicalNot::~CLLogicalNot() = default; @@ -72,4 +72,4 @@ void CLLogicalNot::run() _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp index f9a606e8a5..3db4fdae84 100644 --- a/src/runtime/CL/functions/CLLogicalOr.cpp +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique(); @@ -54,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors) struct CLLogicalOr::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLLogicalOr::CLLogicalOr() - : _impl(std::make_unique()) +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique()) { } -CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; CLLogicalOr::~CLLogicalOr() = default; @@ -73,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp index bef422fca1..e8bdad706b 100644 --- a/src/runtime/CL/functions/CLMatMul.cpp +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
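CLLogicalAnd, CLLogicalNot, CLLogicalOr and CLMatMul above all share the same pImpl layout that this reformatting keeps touching: an Impl struct owning the tensor pointers and the operator, a constructor that creates it with std::make_unique, and move operations plus the destructor defaulted in the .cpp file, where Impl is a complete type. A minimal standalone sketch of that idiom follows; the class and member names are illustrative, not the library's.

#include <memory>

class Function
{
public:
    Function();
    Function(Function &&);
    Function &operator=(Function &&);
    ~Function(); // only declared here: Impl is still incomplete at this point

    void configure(int value);

private:
    struct Impl; // forward declaration, like CLLogicalAnd::Impl above
    std::unique_ptr<Impl> _impl;
};

// "cpp file" part: Impl is complete here, so the defaulted members can be generated.
struct Function::Impl
{
    int value{0};
};

Function::Function() : _impl(std::make_unique<Impl>())
{
}
Function::Function(Function &&)            = default;
Function &Function::operator=(Function &&) = default;
Function::~Function()                      = default;

void Function::configure(int value)
{
    _impl->value = value;
}

int main()
{
    Function f;
    f.configure(42);
    Function g(std::move(f)); // moves the whole Impl, like the defaulted moves above
    return 0;
}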
*/ #include "arm_compute/runtime/CL/functions/CLMatMul.h" + #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTypes.h" + #include "src/gpu/cl/operators/ClMatMul.h" namespace arm_compute @@ -32,23 +34,32 @@ using OperatorType = opencl::ClMatMul; struct CLMatMul::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -CLMatMul::CLMatMul() - : _impl(std::make_unique()) +CLMatMul::CLMatMul() : _impl(std::make_unique()) { } CLMatMul::~CLMatMul() = default; -void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(settings); configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); } -void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); @@ -56,10 +67,14 @@ void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs _impl->op = std::make_unique(); _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; } -Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); } diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 2786d32d33..7494f379b9 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -27,26 +27,32 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _fill(), - _unpooling_layer_kernel(std::make_unique()) + : _fill(), _unpooling_layer_kernel(std::make_unique()) { } CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; -void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); } -void CLMaxUnpoolingLayer::configure(const CLCompileContext 
&compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); const PixelValue zero_value(0.f); @@ -55,7 +61,10 @@ void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICL _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } -Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); } diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index a81cbca1b0..5892c0e840 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index c0cc5184e6..f93f82f1a2 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" namespace arm_compute { @@ -50,7 +50,10 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); @@ -58,21 +61,24 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC // Configure 
normalization kernel _norm_kernel->configure(compile_context, input, output, norm_info); - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Run border handler CLScheduler::get().enqueue(*_border_handler, false); diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 63c9164a94..939c95bd45 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,20 +24,26 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); auto k = std::make_unique(); @@ -45,8 +51,10 @@ void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_contex _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index 186e7b4ba2..ce6d285ebe 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
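On the border handling kept above: the comment explains that the IN_MAP normalization kernel reads four elements at a time (vload4), so the constant border must cover the three elements a four-wide load can touch past the edge. The relationship is simply border = vector width - 1; the tiny example below is illustrative only and not library code.

#include <cstddef>
#include <cstdio>

// A kernel that loads 'vector_width' contiguous elements starting at any valid
// position can read up to (vector_width - 1) elements past the last valid one,
// which is what the constant border has to cover.
constexpr std::size_t required_border(std::size_t vector_width)
{
    return vector_width - 1;
}

int main()
{
    std::printf("vload4  -> border of %zu elements\n", required_border(4));  // 3, as in the comment above
    std::printf("vload16 -> border of %zu elements\n", required_border(16)); // 15
    return 0;
}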
*/ #include "arm_compute/runtime/CL/functions/CLPReluLayer.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/operators/ClPRelu.h" @@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu; struct CLPReluLayer::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPReluLayer::CLPReluLayer() - : _impl(std::make_unique()) +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique()) { } -CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; CLPReluLayer::~CLPReluLayer() = default; @@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { _impl->src_0 = input; _impl->src_1 = alpha; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info())); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 0ed8f03d64..e788ded512 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,37 +22,38 @@ * SOFTWARE. 
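The remaining hunks in this section repeat the same handful of mechanical rewrites: a space after if, braced initialisers without inner spaces, constructor initialiser lists joined onto the constructor line, aligned member declarations, and long parameter lists broken one parameter per line. The snippet below is written in the style the patch converges on, with the previous forms noted in comments; the names are illustrative and the snippet is not part of the patch.

#include <memory>

struct Impl
{
    const int *src{nullptr}; // was: const int *src{ nullptr };
    int       *dst{nullptr}; // was: int *dst{ nullptr };
};

class Example
{
public:
    Example() : _impl(std::make_unique<Impl>()) // was: initializer list on its own line
    {
    }

    // Long signatures are now broken one parameter per line and aligned:
    void configure(const int *input,
                   int       *output,
                   float      scale);

private:
    std::unique_ptr<Impl> _impl;
};

void Example::configure(const int *input, int *output, float scale)
{
    if (input != nullptr && output != nullptr) // was: if(input != nullptr && output != nullptr)
    {
        *output = static_cast<int>(*input * scale);
    }
    _impl->src = input;
    _impl->dst = output;
}

int main()
{
    int in  = 2;
    int out = 0;
    Example e;
    e.configure(&in, &out, 1.5f);
    return out == 3 ? 0 : 1;
}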
*/ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" -#include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(std::make_unique()), - _copy(), - _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique()), _copy(), _perform_pad(false) { } CLPadLayer::~CLPadLayer() = default; -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } @@ -62,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } @@ -81,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { CLScheduler::get().enqueue(*_pad_kernel); } diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index a56afff7df..7f97eed98a 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClPermute.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { struct CLPermute::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - 
std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPermute::CLPermute() - : _impl(std::make_unique()) +CLPermute::CLPermute() : _impl(std::make_unique()) { } @@ -53,7 +52,10 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, perm); diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 9d91e58367..6aa9d9cbb3 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClMul.h" @@ -34,38 +35,55 @@ namespace arm_compute { struct CLPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _impl(std::make_unique()) +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique()) { } -CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input1->info(), 
input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } @@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run() struct CLComplexPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; - -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClComplexMul::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp index 11ae1d0fe6..ce1092a7cc 100644 --- 
a/src/runtime/CL/functions/CLPooling3dLayer.cpp +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool3d.h" @@ -32,14 +33,13 @@ namespace arm_compute { struct CLPooling3dLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPooling3dLayer::CLPooling3dLayer() - : _impl(std::make_unique()) +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique()) { } CLPooling3dLayer::~CLPooling3dLayer() = default; @@ -49,7 +49,10 @@ void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, cons configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); } -void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) { _impl->src = input; _impl->dst = output; @@ -58,7 +61,8 @@ void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const _impl->op->configure(compile_context, input->info(), output->info(), pool_info); } -Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return opencl::ClPool3d::validate(input, output, pool_info); } diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 0ebce318fa..65e53b9be3 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool2d.h" @@ -32,34 +33,44 @@ namespace arm_compute { struct CLPoolingLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr op{nullptr}; }; -CLPoolingLayer::CLPoolingLayer() - : _impl(std::make_unique()) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique()) { } CLPoolingLayer::~CLPoolingLayer() = default; -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); } -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { _impl->src = input; _impl->dst = output; _impl->indices = indices; _impl->op = 
std::make_unique<opencl::ClPool2d>(); - _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? indices->info() : nullptr); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return opencl::ClPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 019f0a7e61..cfd0ec4fbf 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -29,31 +29,40 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } auto k = std::make_unique<CLPriorBoxLayerKernel>(); @@ -61,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, +
const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 7fbb866fa9..12f6f89290 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -26,29 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::utils::info_helpers; using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -78,14 +85,12 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); @@ -104,7 +109,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) _layer_norms(), _copy_output() { - for(auto &norm : _layer_norms) + for (auto &norm : _layer_norms) { norm = std::make_unique<CLQLSTMLayerNormalizationKernel>(); } @@ -129,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage.
- const TensorInfo out - { - in - }; + const TensorInfo out{in}; return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -151,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext 
&compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -191,11 +222,11 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -216,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -238,53 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. 
- if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - 
_projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); } } // Pre-transpose weights to be used in GEMM. - _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -297,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / 
lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -345,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, - mm_out_info, cell_outstage_info); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); - - _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, + &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, + &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); + + _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res); _recurrent_to_cell_outstage_res.allocator()->allocate(); @@ -378,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -393,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, - ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, + &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - 
quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, + &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } CLTensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res); _recurrent_to_input_outstage_res.allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, + ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, 
lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, + &_cell_to_output_outstage_res, gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } CLTensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res); _recurrent_to_output_outstage_res.allocator()->allocate(); @@ -503,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -525,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -536,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _hidden_mul_res.allocator()->allocate(); // Projection. 
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ICLTensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -565,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, + accumulate_destination, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip)); + _projection_clip.configure(compile_context, output_state_out, nullptr, +
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -600,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _copy_output.configure(compile_context, output_state_out, output); } -Status CLQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status CLQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -622,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, 
recurrent_to_forget_weights, + recurrent_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -647,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -674,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -682,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. 
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - 
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, - &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_forget_weights->data_type(), + input_to_forget_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); // Validate weights transpose ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed)); @@ -717,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } 
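Note on the requantization arithmetic that the reflowed hunks around this point make easier to read: each gate computes an effective real scale of the form weight_scale * activation_scale / intermediate_scale (for example input_to_forget_scale and recurrent_to_forget_scale), and quantization::calculate_quantized_multiplier() then folds that real scale into an integer multiplier and shift consumed by the GEMMLowp output stage. The sketch below only illustrates the standard decomposition of such a scale; decompose_scale() is a hypothetical helper written for this note, not the library's implementation.

    // Sketch only: folding a real requantization scale
    //   effective_scale = weight_scale * activation_scale / intermediate_scale
    // into a Q31 fixed-point multiplier plus a power-of-two shift.
    #include <cmath>
    #include <cstdint>

    inline void decompose_scale(float scale, int32_t &multiplier, int32_t &shift) // hypothetical helper
    {
        int   exponent = 0;
        float mantissa = std::frexp(scale, &exponent); // scale = mantissa * 2^exponent, mantissa in [0.5, 1)
        auto  q        = static_cast<int64_t>(std::round(static_cast<double>(mantissa) * (1ll << 31)));
        if (q == (1ll << 31)) // rounding pushed the mantissa up to 1.0
        {
            q /= 2;
            ++exponent;
        }
        multiplier = static_cast<int32_t>(q); // Q31 mantissa: a rounding high multiply by it supplies the 2^-31 factor
        shift      = -exponent;               // remaining power of two: >0 means a right shift afterwards, <0 a left shift
    }

For the gate scales used here, which are well below 1, the exponent comes out negative, so the resulting shift is a plain arithmetic right shift applied after the 32-bit multiply.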
GEMMLowpOutputStageInfo gemmlowp_info; @@ -738,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, 
lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -770,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, 
&cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -791,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, 
&recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + 
&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, 
recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -877,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. 
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = 
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -980,14 +1210,14 @@ void CLQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget)); } @@ -1002,7 +1232,7 @@ void CLQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell)); } @@ -1010,7 +1240,7 @@ void CLQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1022,14 +1252,14 @@ void CLQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input)); } @@ -1041,7 +1271,7 @@ void CLQLSTMLayer::run() _pixelwise_mul_forget_cell.run(); _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1052,14 +1282,14 @@ void CLQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output)); } @@ -1072,31 +1302,31 @@ void CLQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1108,7 +1338,7 @@ void CLQLSTMLayer::run() void CLQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Pre-transpose weights to be used in GEMM. _input_to_forget_weights_transposed.allocator()->allocate(); @@ -1125,10 +1355,11 @@ void CLQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { _ones.map(true); - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); _ones.unmap(); } else @@ -1136,10 +1367,12 @@ void CLQLSTMLayer::prepare() _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights}, + {ACL_DST, &_input_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); - ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights}, + {ACL_DST, &_recurrent_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); @@ -1156,30 +1389,35 @@ void CLQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights}, + {ACL_DST, &_input_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); - ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } }; + ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights}, + {ACL_DST, &_recurrent_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); - ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); - ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights}, + {ACL_DST, 
&_recurrent_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); - ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights}, + {ACL_DST, &_input_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); - ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights}, + {ACL_DST, &_recurrent_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); - if(_has_projection) + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}}; CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); - if(_projection_bias != nullptr) + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1189,7 +1427,7 @@ void CLQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index b249bdd1db..6edef29992 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClQuantize.h" @@ -32,13 +33,12 @@ namespace arm_compute { struct CLQuantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLQuantizationLayer::CLQuantizationLayer() - : _impl(std::make_unique()) +CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique()) { } CLQuantizationLayer::~CLQuantizationLayer() = default; diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 6f122866b2..34b78eefa7 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,24 +28,37 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_kernel(), + _activation(), + _fully_connected_kernel(), + _copy(), + _fully_connected_out(), + _gemm_output(), + 
_add_output(), _is_prepared(false) { } CLRNNLayer::~CLRNNLayer() = default; -Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status CLRNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -63,28 +76,42 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, +void CLRNNLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, + output, info); } -void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info) +void CLRNNLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, 
recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); @@ -133,7 +160,7 @@ void CLRNNLayer::run() void CLRNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected_kernel.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 867ef7c7ac..1939d1d0ba 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,26 +24,36 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" namespace arm_compute { -Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index 239a1c6bb2..0d2eab0c76 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,24 +22,35 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" + #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" using namespace arm_compute; -Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 3fbbd5f952..5c3f7f9c8c 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLRangeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLRangeKernel.h" using namespace arm_compute; @@ -38,7 +38,8 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRange::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_LOG_PARAMS(output, start, end, step); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index cddbf77d7c..6c6daff5ba 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -27,23 +27,25 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -51,29 +53,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -87,8 +89,9 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); - if(requant) + const bool requant = + is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if (requant) { TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); CLDequantizationLayer::validate(input, &input_no_quant); @@ -98,10 +101,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } return Status{}; } -} +} // namespace CLReduceMean::CLReduceMean(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(), + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _dequant(), + _requant(), + _reduction_ops(), + _keep_dims(), + _do_requant(), + _input_no_quant(), _output_no_quant() { } @@ -111,17 +123,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); } -void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const 
Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +void CLReduceMean::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info(); + _do_requant = is_data_type_quantized(input->info()->data_type()) && + input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); @@ -129,7 +147,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor ICLTensor *tmp_input = input; ICLTensor *tmp_output = output; - if(_do_requant) + if (_do_requant) { _memory_group.manage(&_input_no_quant); _memory_group.manage(&_output_no_quant); @@ -148,46 +166,51 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!_keep_dims) + if (!_keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); } - if(_do_requant) + if (_do_requant) { _requant.configure(compile_context, &_output_no_quant, output); _input_no_quant.allocator()->allocate(); @@ -195,7 +218,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -204,19 +230,19 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_requant) + if (_do_requant) { _dequant.run(); } - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } - if(_do_requant) + if (_do_requant) { _requant.run(); } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index cdc7fec51b..ba5489018e 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -27,35 +27,43 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false) + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), + _is_reshape_required(false) { } CLReductionOperation::~CLReductionOperation() = default; -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of 
dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -67,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } @@ -92,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output) { - if(!_is_reshape_required) + if (!_is_reshape_required) { return output; } @@ -103,12 +112,18 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -117,11 +132,17 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, 
false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _memory_group.manage(&_unreshaped_output); } @@ -129,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _reduction_kernel = std::make_unique(); _reduction_kernel->configure(compile_context, input, output_internal, axis, op); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(compile_context, &_unreshaped_output, output); _unreshaped_output.allocator()->allocate(); @@ -142,7 +163,7 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 15de959225..156e9b90c1 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include @@ -40,7 +40,10 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_LOG_PARAMS(input, output, stride); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index c51a3298c1..3d6349fb25 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClReshape.h" @@ -35,17 +36,16 @@ namespace arm_compute { struct CLReshapeLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLReshapeLayer::CLReshapeLayer() - : _impl(std::make_unique()) +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique()) { } -CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; CLReshapeLayer::~CLReshapeLayer() = default; @@ -78,4 +78,4 @@ void CLReshapeLayer::run() _impl->op->run(pack); } } // namespace arm_compute -/** [CLReshapeLayer snippet] **/ \ No newline at end of file + /** [CLReshapeLayer snippet] **/ diff 
--git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 1fc93571d9..415de52e64 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLReverseKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis) { ARM_COMPUTE_LOG_PARAMS(input, output, axis); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 5b78989bfa..abff0724e4 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClScale.h" @@ -33,13 +34,12 @@ namespace arm_compute { struct CLScale::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLScale::CLScale() - : _impl(std::make_unique()) +CLScale::CLScale() : _impl(std::make_unique()) { } CLScale::~CLScale() = default; @@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index c4ab3dc67a..b4897d9e62 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSelectKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSelectKernel.h" using namespace arm_compute; @@ -38,7 +38,11 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(c, x, y, output); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index 7e3ac7d769..f79c6a1235 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ 
b/src/runtime/CL/functions/CLSlice.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct CLSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLSlice::CLSlice() - : _impl(std::make_unique()) +CLSlice::CLSlice() : _impl(std::make_unique()) { } -CLSlice::CLSlice(CLSlice &&) = default; +CLSlice::CLSlice(CLSlice &&) = default; CLSlice &CLSlice::operator=(CLSlice &&) = default; CLSlice::~CLSlice() = default; -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::CLSlice::validate(input, output, starts, ends); } @@ -89,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); } -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index d52352fc8d..2e70e2aa08 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -22,12 +22,14 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" #include "src/gpu/cl/operators/ClPermute.h" @@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax; template struct CLSoftmaxLayerGeneric::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor } template -void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) +void CLSoftmaxLayerGeneric::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { _impl->src = input; _impl->dst = output; _impl->op = std::make_unique(); - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; return OperatorType::validate(*input, *output, softmax_info); } template -void CLSoftmaxLayerGeneric::run() +void CLSoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 3b7083400b..37f728895f 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,71 +29,100 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(std::make_unique()), - _fill(), - _has_padding(false) + : _space_to_batch_kernel(std::make_unique()), _fill(), _has_padding(false) { } CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor 
*block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, + output); } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { - 
ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -101,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { //CLScheduler::get().enqueue(*_fill, true); _fill.run(); diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 67dafff47f..22695c9ef3 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel(std::make_unique()) +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique()) { } @@ -47,7 +46,10 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); _space_to_depth_kernel->configure(compile_context, input, output, block_shape); diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index 0b27371e3f..6be43cc5cd 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute @@ -38,7 +39,7 @@ void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < 
_num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 6a335da00c..c15496fc31 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,16 +30,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLStackLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" + +#include namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -52,7 +50,10 @@ void CLStackLayer::configure(const std::vector &input, int axis, IC configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector &input, + int axis, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); @@ -61,7 +62,7 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std: // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels.emplace_back(std::make_unique()); _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); @@ -79,7 +80,7 @@ Status CLStackLayer::validate(const std::vector &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -92,7 +93,7 @@ Status CLStackLayer::validate(const std::vector &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { CLScheduler::get().enqueue(*_stack_kernels[i], false); } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index 261bdc13d1..c1953cc415 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides 
&strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto k = std::make_unique(); @@ -43,9 +49,14 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IT _kernel = std::move(k); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct CLStridedSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) - : _impl(std::make_unique()) +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; CLStridedSlice::~CLStridedSlice() = default; -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -86,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); } -Status CLStridedSlice::validate(const 
ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } void CLStridedSlice::run() diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index ef790995f9..4f86c4adfa 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "src/core/CL/kernels/CLTileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute { @@ -34,7 +33,10 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_LOG_PARAMS(input, output, multiples); auto k = std::make_unique(); diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index e63c92eeb4..5a738f47ce 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClTranspose.h" @@ -34,12 +35,11 @@ namespace arm_compute { struct CLTranspose::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -CLTranspose::CLTranspose() - : _impl(std::make_unique()) +CLTranspose::CLTranspose() : _impl(std::make_unique()) { } CLTranspose::~CLTranspose() = default; @@ -70,4 +70,4 @@ void CLTranspose::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 98d47810ab..ddd83e7824 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -40,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to 
slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -56,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -66,15 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector &output_vector, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -87,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -106,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index b416d0fcf1..645f817030 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" @@ -35,15 +36,15 @@ namespace arm_compute { struct CLWinogradConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) @@ -54,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; @@ -70,20 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, _impl->src }, - { TensorType::ACL_SRC_1, _impl->weights }, - { TensorType::ACL_SRC_2, _impl->biases }, - { TensorType::ACL_DST, _impl->dst } - }; - _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } @@ -97,7 +114,7 @@ void CLWinogradConvolutionLayer::run() void CLWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -107,4 +124,4 @@ void CLWinogradConvolutionLayer::prepare() _impl->is_prepared = true; } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp index 18ade97885..4270165ab4 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -25,6 +25,7 @@ 
#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,109 +44,109 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Bifrost architectures - static std::map gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G71 configurations - static std::map gemm_g71_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g71_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G52 configurations - static std::map gemm_g52_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g52_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, 
&CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G76 configurations - static std::map gemm_g76_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map gemm_g76_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) + if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) + if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G52: - if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) + if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) { - return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; - if(is_rhs_constant) + if (is_rhs_constant) { - if((m > 1) && (n < 16)) + if ((m > 1) && (n < 16)) { gemm_type = CLGEMMKernelType::RESHAPED; } - else if(m == 1) + else if (m == 1) { gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if((k > 256) && (m > 4)) + if ((k > 256) && (m > 4)) { constexpr float alpha = 3.2f; constexpr float fact0 = 1.51f; constexpr float fact1 = 1.66f; 
constexpr float ops = 12.0f; const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { @@ -156,19 +156,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned const auto workload = static_cast((m * n) / 20.0f); - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type; + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; } return gemm_type; } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -183,11 +185,12 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(k <= 496) + if (k <= 496) { - if(n <= 544) + if (n <= 544) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } else { - if(k <= 588) + if (k <= 588) { - if(k <= 552) + if (k <= 552) { - if(m <= 148) + if (m <= 148) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 278) + if (m <= 278) { return CLGEMMKernelType::RESHAPED; } @@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int const float r_nk = static_cast(n) / static_cast(k); const float r_mnk = static_cast(m) / (static_cast(n) * 
static_cast(k)); - if(r_mn <= 1.5469f) + if (r_mn <= 1.5469f) { - if(r_mk <= 0.8766f) + if (r_mk <= 0.8766f) { - if(r_mk <= 0.0211f) + if (r_mk <= 0.0211f) { - if(r_mnk <= 77.5833f) + if (r_mnk <= 77.5833f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.0832f) + if (r_nk <= 0.0832f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mnk <= 193.0000f) + if (r_mnk <= 193.0000f) { - if(r_mn <= 0.9948f) + if (r_mn <= 0.9948f) { - if(r_mk <= 2.5453f) + if (r_mk <= 2.5453f) { return CLGEMMKernelType::RESHAPED; } @@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 17.7370f) + if (r_mn <= 17.7370f) { - if(r_mnk <= 1391.2875f) + if (r_mnk <= 1391.2875f) { - if(r_mk <= 2.9724f) + if (r_mk <= 2.9724f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mnk <= 470.0000f) + if (r_mnk <= 470.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.1381f) + if (r_nk <= 0.1381f) { - if(r_mnk <= 9040.5000f) + if (r_mnk <= 9040.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 5.6790f) + if (r_mn <= 5.6790f) { return CLGEMMKernelType::RESHAPED; } @@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int const float r_mn = static_cast(m) / static_cast(n); const float r_nk = static_cast(n) / static_cast(k); - if(k <= 212) + if (k <= 212) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_nk <= 0.4990234375f) + if (r_nk <= 0.4990234375f) { - if(k <= 1392) + if (k <= 1392) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 325) + if (m <= 325) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } else { - if(k <= 471) + if (k <= 471) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mn <= 0.04475911520421505f) + if (r_mn <= 0.04475911520421505f) { return CLGEMMKernelType::RESHAPED; } @@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - 
if(n <= 127.0000f) + if (n <= 127.0000f) { - if(n <= 63.5000f) + if (n <= 63.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 3616.0000f) + if (m <= 3616.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 2970.5000f) + if (m <= 2970.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 12.5000f) + if (m <= 12.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 490.0000f) + if (m <= 490.0000f) { - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 226.0000f) + if (m <= 226.0000f) { - if(n <= 140.0000f) + if (n <= 140.0000f) { - if(m <= 179.5000f) + if (m <= 179.5000f) { return CLGEMMKernelType::RESHAPED; } @@ -556,15 +563,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); ARM_COMPUTE_UNUSED(n); ARM_COMPUTE_UNUSED(k); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index ef30b28f96..673038a8db 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -35,8 +36,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map gemm_configs = - { - { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 } - }; + static std::map gemm_configs = { + {DataType::F32, 
&CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } @@ -68,7 +67,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -76,7 +76,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -84,7 +85,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp index 9e779d3752..851e23bc84 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,135 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Valhall architectures - static std::map gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { 
DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G77 configurations - static std::map gemm_g77_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G78 configurations - static std::map gemm_g78_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G710 and Mali-G610 configurations - static std::map gemm_g710_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G715 and Mali-G615 configurations - static std::map gemm_g715_configs = - { - { DataType::F32, 
&CLGEMMDefaultTypeValhall::g715_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G710: case GPUTarget::G610: - if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) { - return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G715: case GPUTarget::G615: - if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) { - return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G78: - if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) { - return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G77: - if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) { - return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -182,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 471.0000f) + if (k <= 471.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 72.5000f) + if (m <= 72.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 90.5000f) + if (m <= 90.5000f) { return CLGEMMKernelType::RESHAPED; } else { - if(k <= 2448.0000f) + if (k <= 2448.0000f) { - if(n <= 756.0000f) + if (n <= 756.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -241,11 +243,12 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } @@ -253,9 +256,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int return CLGEMMKernelType::RESHAPED_ONLY_RHS; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool 
is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return default_f32(m, n, k, b, is_rhs_constant); } @@ -263,7 +267,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } @@ -273,9 +277,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return g78_f16(m, n, k, b, is_rhs_constant); } @@ -283,7 +288,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index 6189a324cf..c528dbcac4 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -25,6 +25,7 @@ #define SRC_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" @@ -45,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(gpu); diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp index b06c3b0f8e..8df57197e2 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" @@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_ bool valid = false; CLGEMMKernelType gemm_type{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, 
query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); } else { @@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) @@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshapedOnlyRHS config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, - config.export_cl_image); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) @@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; 
mlgo::GEMMConfigReshaped config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs, - config.transpose_rhs, config.export_cl_image); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) @@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) std::unique_ptr gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) @@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigNative config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query 
failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } } // namespace auto_heuristics } // namespace cl_gemm -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h index 020237b7f4..f544715e03 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -50,8 +50,7 @@ struct CommonQuery /** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ struct GEMMTypeResult { - GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) - : valid{ valid }, gemm_type{ gemm_type } + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} { } /** Test if the result is valid */ @@ -67,7 +66,7 @@ struct GEMMTypeResult struct GEMMConfigResult { GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) - : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info } + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} { } /** Test if the result is valid */ @@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); } // namespace cl_gemm } // namespace arm_compute -#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H \ No newline at end of file +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h index c451bd9062..08a7ee8c18 100644 --- a/src/runtime/CL/mlgo/Common.h +++ b/src/runtime/CL/mlgo/Common.h @@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType; /** GEMM Configuration for Native kernel */ struct GEMMConfigNative { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** GEMM Configuration for Reshaped Only RHS kernel */ struct GEMMConfigReshapedOnlyRHS { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. 
n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; /** GEMM Configuration for Reshaped kernel */ struct GEMMConfigReshaped { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; } // namespace mlgo } // namespace arm_compute -#endif // SRC_RUNTIME_CL_MLGO_COMMON_H \ No newline at end of file +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp index 1c75cdc427..f7b706902b 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.cpp +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/runtime/CL/mlgo/HeuristicTree.h" + #include "arm_compute/core/Log.h" #include "support/Cast.h" @@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond) // PRE: all features and ConditionalOps are valid constexpr float eps = 0.0001f; // Calculate all secondary features - std::vector> cond_values - { - { "m", static_cast(shape.m) }, - { "n", static_cast(shape.n) }, - { "k", static_cast(shape.k) }, - { "b", static_cast(shape.b) }, - { "r_mn", static_cast(shape.m) / shape.n }, - { "r_mk", static_cast(shape.m) / shape.k }, - { "r_nk", static_cast(shape.n) / shape.k }, - { "r_mnk", static_cast(shape.m) / (static_cast(shape.n) / shape.k) }, - { "workload", (static_cast(shape.m) * shape.n * shape.b) / 20.0 } - }; - auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(), - [&cond](decltype(*cond_values.begin()) it) - { - return it.first == cond.feature; - }); + std::vector> cond_values{ + {"m", static_cast(shape.m)}, + {"n", static_cast(shape.n)}, + {"k", static_cast(shape.k)}, + {"b", static_cast(shape.b)}, + {"r_mn", static_cast(shape.m) / shape.n}, + {"r_mk", static_cast(shape.m) / shape.k}, + {"r_nk", static_cast(shape.n) / shape.k}, + {"r_mnk", static_cast(shape.m) / (static_cast(shape.n) / shape.k)}, + {"workload", (static_cast(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; }); ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); const float cond_value = cond_value_pair_it->second; - switch(cond.op) + switch (cond.op) { case ConditionalOp::LT: { @@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes; constexpr size_t HeuristicTree::_max_query_depth; constexpr HeuristicTree::NodeID HeuristicTree::_root; -HeuristicTree::HeuristicTree() - : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) { } HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) - : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{} + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} { } @@ -108,16 +104,17 @@ std::pair HeuristicTree::query(GEMMShape shape) const // Root ID = 0; auto cur_node = _tree.at(_root).get(); size_t depth = 0; - while(cur_node->type() != NodeType::Leaf) + while (cur_node->type() != NodeType::Leaf) { - if(depth > _max_query_depth) + if (depth > _max_query_depth) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. 
Is the tree too deep?", + _max_query_depth); return std::make_pair(false, T{}); } ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); auto br_node = utils::cast::polymorphic_downcast(cur_node); - if(evaluate(shape, br_node->condition)) + if (evaluate(shape, br_node->condition)) { cur_node = _tree.at(br_node->true_node).get(); } @@ -135,12 +132,12 @@ std::pair HeuristicTree::query(GEMMShape shape) const template bool HeuristicTree::add_leaf(NodeID id, T val) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val) bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - const std::set supported_features = - { - "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload" - }; - const auto orig_feature = cond.feature; - std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c) - { - return std::tolower(c); - }); - if(supported_features.find(cond.feature) == supported_features.end()) + const std::set supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == supported_features.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID bool HeuristicTree::check_if_structurally_correct() const { std::set visited; - std::deque to_visit{ _root }; + std::deque to_visit{_root}; - while(!to_visit.empty()) + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); - if(_tree.find(id) == _tree.end()) + if (_tree.find(id) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); return false; } auto not_seen_before = visited.insert(id); - if(!not_seen_before.second) + if (!not_seen_before.second) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); return false; } auto cur_node = _tree.at(id).get(); - if(cur_node->type() == NodeType::Branch) + if (cur_node->type() == NodeType::Branch) { auto br_node = utils::cast::polymorphic_downcast(cur_node); to_visit.push_back(br_node->true_node); to_visit.push_back(br_node->false_node); } } - if(visited.size() != _tree.size()) + if (visited.size() != _tree.size()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); return false; @@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const bool HeuristicTree::check() { - if(_tree.empty()) + if (_tree.empty()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty 
tree encountered"); return false; } - if(_tree.find(_root) == _tree.end()) + if (_tree.find(_root) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root); return false; @@ -237,7 +229,8 @@ template std::pair HeuristicTree::query(GEMMShape shap /** Explicit template instantiation @relates HeuristicTree */ template std::pair HeuristicTree::query(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ -template std::pair HeuristicTree::query(GEMMShape shape) const; +template std::pair +HeuristicTree::query(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ template std::pair HeuristicTree::query(GEMMShape shape) const; diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h index d5c7de2215..a4f8c116b9 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.h +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -25,6 +25,7 @@ #define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H #include "arm_compute/core/Types.h" + #include "src/runtime/CL/mlgo/Common.h" #include @@ -84,7 +85,7 @@ public: struct BranchNode : public Node { BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) - : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node } + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} { } NodeType type() const override @@ -100,8 +101,7 @@ public: template struct LeafNode : public Node { - LeafNode(NodeID id, T val) - : id{ id }, value{ val } + LeafNode(NodeID id, T val) : id{id}, value{val} { } NodeType type() const override @@ -177,22 +177,22 @@ public: bool check(); private: - static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query - static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree - static constexpr NodeID _root{ 0 }; // Root tree ID + static constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Root tree ID private: bool check_if_structurally_correct() const; private: - TreeID _id; /**< Heuristic tree ID */ - HeuristicType _heuristic_type; /**< Heuristic type */ - std::string _ip_target; /**< IP target associated with the tree */ - DataType _data_type; /**< Data type associated with the tree */ - std::map> _tree; /**< Tree representation */ + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map> _tree; /**< Tree representation */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp index 80f3bb85e9..aed46cd80f 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -24,6 +24,7 @@ #include "src/runtime/CL/mlgo/MLGOHeuristics.h" #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/MLGOParser.h" #include "src/runtime/CL/mlgo/Utils.h" @@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) } bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) { - return std::tie(lhs.m0, lhs.n0, 
lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, - rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, - rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } constexpr size_t MLGOHeuristics::_max_num_trees; -MLGOHeuristics::MLGOHeuristics() - : _indices{}, _trees{}, _tree_valid{}, _valid{ false } +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} { } @@ -59,71 +60,74 @@ std::pair MLGOHeuristics::query_gemm_type(const Query &query) co { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); const auto invalid = GEMMType::RESHAPED; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_native(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigNative{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. 
%s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshapedOnlyRHS{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } std::pair MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshaped{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query(shape_query); } @@ -131,14 +135,14 @@ std::pair MLGOHeuristics::query_gemm_config_reshaped(c bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) { bool status; - HeuristicTree *tree{ nullptr }; + HeuristicTree *tree{nullptr}; std::tie(status, tree) = get_heuristic_tree(id); - if(!status) + if (!status) { return status; } status = tree->check(); - if(!status) + if (!status) { return status; } @@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) bool MLGOHeuristics::check_all() const { // Tree validities are already checked and cached. - bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) - { - return !v.second; - }) - == _tree_valid.end(); - if(!all_trees_are_checked) + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo"); + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. 
This could also indicate there are no trees in the dotmlgo"); return false; } @@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const std::pair MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) { - if(_indices.find(id) == _indices.end()) + if (_indices.find(id) == _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); return std::make_pair(false, nullptr); } const auto index = _indices[id]; - if(_trees.find(index) == _trees.end()) + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); return std::make_pair(false, nullptr); @@ -186,7 +188,7 @@ std::pair MLGOHeuristics::get_heuristic_tree(HeuristicTre bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) { - if(_indices.size() >= _max_num_trees) + if (_indices.size() >= _max_num_trees) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); return false; @@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // PRE: correctness of t is guaranteed by the tree construction process // Ensure unique id const auto id = t.id(); - if(_indices.find(id) != _indices.end()) + if (_indices.find(id) != _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); return false; @@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // Ensure unique index const auto index = t.index(); - if(_trees.find(index) != _trees.end()) + if (_trees.find(index) != _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); return false; @@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); return _valid = false; } return reload_from_stream(fs); @@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) bool MLGOHeuristics::reload_from_stream(std::istream &in) { auto parsed = parser::parse_mlgo(in); - if(!parsed.first) + if (!parsed.first) { ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. 
Use default heuristics instead"); return _valid = false; @@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in) } } // namespace mlgo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h index aa21225959..6a491c5503 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.h +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -135,16 +135,16 @@ public: bool check_all() const; private: - static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/ + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/ private: // There exists a one-to-one mappipng between TreeID and Index, either can be used to identify a @ref HeuristicTree std::map _indices; /**< A mapping from TreeID to Index */ std::map _trees; /**< A mapping from Index to HeuristicTree */ std::map _tree_valid; /**< Result cache of the tree validity checks */ - bool _valid; /**< Overall validity */ + bool _valid; /**< Overall validity */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp index 625739e450..893daf2ed9 100644 --- a/src/runtime/CL/mlgo/MLGOParser.cpp +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -22,19 +22,21 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/MLGOParser.h" + #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/Utils.h" #include #define CHECK(parser_expr, valid_var) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return; #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return default_val; #ifdef ARM_COMPUTE_LOGGING_ENABLED @@ -53,8 +55,7 @@ valid_var = false; \ return default_val; -#define LOG_TOKEN_POS(tokens, pos_var) \ - const auto pos_var = tokens.current_pos(); +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); #else // ARM_COMPUTE_LOGGING_ENABLED @@ -73,19 +74,12 @@ namespace { void ltrim(std::string &str) { - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) - { - return !std::isspace(ch); - })); + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); } void rtrim(std::string &str) { - str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) - { - return !std::isspace(ch); - }).base(), - str.end()); + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); } void trim(std::string &str) @@ -109,7 +103,7 @@ enum class ComparatorType }; TokenStream::TokenStream(std::istream &s, const std::string &delims) - : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{} + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} { read(); } @@ -125,7 +119,7 @@ Token TokenStream::take() ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); Token t = _tokens.front(); _tokens.pop_front(); - if(_tokens.empty()) + if (_tokens.empty()) { read(); } @@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i) ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead"); // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End 
token at the end - while(_istream && _tokens.size() <= i) + while (_istream && _tokens.size() <= i) { read(); } @@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i) void advance(CharPosition &pos, char ch) { - if(ch == '\n') + if (ch == '\n') { pos.ln += 1; pos.col = 0; @@ -167,17 +161,16 @@ void TokenStream::read() do { // Reached eof - if(!_istream.get(ch)) + if (!_istream.get(ch)) { - if(!reached_end()) + if (!reached_end()) { _tokens.emplace_back(TokenType::End, "", _lookahead_pos); } return; } advance(_lookahead_pos, ch); - } - while(std::isspace(ch) || is_delim(ch)); + } while (std::isspace(ch) || is_delim(ch)); // Read chars until we hit a delim or eof auto orig_pos = _lookahead_pos; auto tok = recognize_tok(ch); @@ -190,41 +183,41 @@ void TokenStream::read() Token TokenStream::recognize_tok(char ch) { - if(ch == '[') + if (ch == '[') { - return Token{ TokenType::L_List, "", _lookahead_pos }; + return Token{TokenType::L_List, "", _lookahead_pos}; } - else if(ch == ']') + else if (ch == ']') { - return Token{ TokenType::R_List, "", _lookahead_pos }; + return Token{TokenType::R_List, "", _lookahead_pos}; } - else if(ch == '.') + else if (ch == '.') { - return float_after_dp_st(std::string{ ch }); + return float_after_dp_st(std::string{ch}); } - else if(std::isdigit(ch)) + else if (std::isdigit(ch)) { - return num_st(std::string{ ch }); + return num_st(std::string{ch}); } else { - return text_st(std::string{ ch }); + return text_st(std::string{ch}); } } Token TokenStream::num_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(ch == '.') + if (ch == '.') { return float_after_dp_st(value + ch); } - else if(!std::isdigit(ch)) + else if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value) } value += ch; } - return Token{ TokenType::Int, value, _lookahead_pos }; + return Token{TokenType::Int, value, _lookahead_pos}; } Token TokenStream::float_after_dp_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(!std::isdigit(ch)) + if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value) } value += ch; } - return Token{ TokenType::Float, value, _lookahead_pos }; + return Token{TokenType::Float, value, _lookahead_pos}; } Token TokenStream::text_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(is_delim(ch)) + if (is_delim(ch)) { break; } - if(ch == '[' || ch == ']') + if (ch == '[' || ch == ']') { rewind(_lookahead_pos); _istream.unget(); @@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value) } value += ch; } - return Token{ TokenType::Text, value, _lookahead_pos }; + return Token{TokenType::Text, value, _lookahead_pos}; } bool TokenStream::reached_end() const @@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::End) + if (tok.type != TokenType::End) { FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); } @@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) 
+ if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); } @@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); } @@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); - if(val < 0) + if (val < 0) { FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); } @@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Float) + if (tok.type != TokenType::Float) { FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); } @@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Text || tok.value.empty()) + if (tok.type != TokenType::Text || tok.value.empty()) { FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); } @@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid) bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) { auto tok = in.peek(); - if(tok.type == TokenType::Text && tok.value == c_str) + if (tok.type == TokenType::Text && tok.value == c_str) { - if(take) + if (take) { in.take(); } @@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) void expect_text(TokenStream &in, const std::string &str, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_text(in, str)) + if (!accept_text(in, str)) { FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); } @@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid) bool accept_l_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::L_List) + if (tok.type == TokenType::L_List) { in.take(); return true; @@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in) void expect_l_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_l_list(in)) + if (!accept_l_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect '['"); } @@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid) bool accept_r_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::R_List) + if (tok.type == TokenType::R_List) { in.take(); return true; @@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in) void expect_r_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_r_list(in)) + if (!accept_r_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect ']'"); } @@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid) ConditionalOp conditional_op(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "<=")) + if (accept_text(in, "<=")) { return ConditionalOp::LE; } - else if(accept_text(in, ">=")) + else if (accept_text(in, ">=")) { return ConditionalOp::GE; } - else if(accept_text(in, "==")) + else if (accept_text(in, "==")) { return ConditionalOp::EQ; } - else if(accept_text(in, "<")) + else if (accept_text(in, "<")) { return ConditionalOp::LT; } - else if(accept_text(in, ">")) + else if (accept_text(in, ">")) { return ConditionalOp::GT; } @@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid) { CHECK(expect_text(in, "ip-type", valid), valid); LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gpu")) + if 
(accept_text(in, "gpu")) { ; } - else if(accept_text(in, "cpu")) + else if (accept_text(in, "cpu")) { ; } @@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid) DataType data_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "f16")) + if (accept_text(in, "f16")) { return DataType::F16; } - else if(accept_text(in, "f32")) + else if (accept_text(in, "f32")) { return DataType::F32; } - else if(accept_text(in, "qasymm8")) + else if (accept_text(in, "qasymm8")) { return DataType::QASYMM8; } @@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid) ComparatorType comparator_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "var")) + if (accept_text(in, "var")) { return ComparatorType::Var; } - else if(accept_text(in, "num")) + else if (accept_text(in, "num")) { return ComparatorType::Num; } - else if(accept_text(in, "enum")) + else if (accept_text(in, "enum")) { return ComparatorType::Enum; } @@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid) HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gemm-type", take)) + if (accept_text(in, "gemm-type", take)) { return HeuristicType::GEMM_Type; } - else if(accept_text(in, "gemm-config-native", take)) + else if (accept_text(in, "gemm-config-native", take)) { return HeuristicType::GEMM_Config_Native; } - else if(accept_text(in, "gemm-config-reshaped-only-rhs", take)) + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) { return HeuristicType::GEMM_Config_Reshaped_Only_RHS; } - else if(accept_text(in, "gemm-config-reshaped", take)) + else if (accept_text(in, "gemm-config-reshaped", take)) { return HeuristicType::GEMM_Config_Reshaped; } @@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val { LOG_TOKEN_POS(in, pos); auto ht = CHECK(heuristic_type(in, valid, false), valid); - if(ht != expected_ht) + if (ht != expected_ht) { FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); } @@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val GEMMType gemm_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "native")) + if (accept_text(in, "native")) { return GEMMType::NATIVE; } - else if(accept_text(in, "reshaped-only-rhs")) + else if (accept_text(in, "reshaped-only-rhs")) { return GEMMType::RESHAPED_ONLY_RHS; } - else if(accept_text(in, "reshaped")) + else if (accept_text(in, "reshaped")) { return GEMMType::RESHAPED; } @@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigNative{ m0, n0, k0 }; + return GEMMConfigNative{m0, n0, k0}; } GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) @@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex }; + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; } GEMMConfigReshaped 
gemm_config_reshaped(TokenStream &in, bool &valid) @@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex }; + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; } void gpu_priority(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "best-performance")) + if (accept_text(in, "best-performance")) { ; } - else if(accept_text(in, "best-memory-usage")) + else if (accept_text(in, "best-memory-usage")) { ; } @@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid) void gpu_behavior(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "static")) + if (accept_text(in, "static")) { ; } - else if(accept_text(in, "dynamic")) + else if (accept_text(in, "dynamic")) { ; } @@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid) void free_vars(TokenStream &in, bool &valid) { CHECK(expect_l_list(in, valid), valid); - while(!accept_r_list(in)) + while (!accept_r_list(in)) { CHECK(text_val(in, valid), valid); } @@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) { CHECK(expect_text(in, "", valid), valid); - while(!accept_text(in, "")) + while (!accept_text(in, "")) { CHECK(heuristics_table_entry(in, h, valid), valid); } @@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid) const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); - if(l_t != ComparatorType::Var || r_t != ComparatorType::Num) + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) { - FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); } - return Condition{ l_v, c_o, r_v }; + return Condition{l_v, c_o, r_v}; } void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) @@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) CHECK(expect_text(in, "", valid), valid); - HeuristicTree *t = nullptr; - std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); const HeuristicType t_heuristic_type = std::get<0>(t->index()); - while(!accept_text(in, "")) + while (!accept_text(in, "")) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "b")) + if (accept_text(in, "b")) { // Branch node const auto id = CHECK(uint_val(in, valid), valid); @@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) const auto f_id = CHECK(uint_val(in, valid), valid); valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); } - else if(accept_text(in, "l")) + else if (accept_text(in, "l")) { // Leaf node const auto id = CHECK(uint_val(in, valid), valid); @@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) // heuristic table). For now it remains as a step for validation. 
LOG_TOKEN_POS(in, pos); CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); - switch(t_heuristic_type) + switch (t_heuristic_type) { case HeuristicType::GEMM_Type: { @@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid) MLGOHeuristics h; CHECK_DEFAULT(header(in, valid), valid, h); CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); - while(accept_text(in, " parse_mlgo(std::istream &in) #undef CHECK #undef CHECK_DEFAULT #undef FAIL_WITH_MSG -#undef FAIL_WITH_MSG_DEFAULT \ No newline at end of file +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h index 49d8b9c644..cffce8d6a1 100644 --- a/src/runtime/CL/mlgo/MLGOParser.h +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -98,15 +98,14 @@ struct CharPosition return ln == other.ln && col == other.col; } - size_t ln{ 0 }; - size_t col{ 0 }; + size_t ln{0}; + size_t col{0}; }; /** Token */ struct Token { - Token(TokenType t, std::string v, CharPosition pos) - : type{ t }, value{ v }, pos{ pos } + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} { } @@ -196,4 +195,4 @@ std::pair parse_mlgo(std::istream &in); } // namespace parser } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp index 81d418c28e..c7e0100b3c 100644 --- a/src/runtime/CL/mlgo/Utils.cpp +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -43,40 +43,38 @@ inline std::string to_str(const T &val) std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) { return os << "Native:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) { return os << "ReshapedOnlyRHS:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) { return os << "Reshaped:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "v0: " << config.v0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_lhs: " << config.interleave_lhs << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << 
"}"; } std::ostream &operator<<(std::ostream &os, HeuristicType ht) { - switch(ht) + switch (ht) { case HeuristicType::GEMM_Type: { @@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht) } std::ostream &operator<<(std::ostream &os, DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: { @@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos) } // namespace mlgo -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h index c634a887e9..73b537f476 100644 --- a/src/runtime/CL/mlgo/Utils.h +++ b/src/runtime/CL/mlgo/Utils.h @@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht); std::ostream &operator<<(std::ostream &os, DataType dt); std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); std::ostream &operator<<(std::ostream &os, const Query &query); -std::string to_string(const GEMMConfigNative &config); -std::string to_string(const GEMMConfigReshapedOnlyRHS &config); -std::string to_string(const GEMMConfigReshaped &config); -std::string to_string(const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); namespace parser { std::ostream &operator<<(std::ostream &os, const CharPosition &pos); @@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos); } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_UTILS_H \ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6f3e32491a..5e3907f1ea 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -27,20 +27,20 @@ namespace arm_compute { namespace cl_tuner { -constexpr unsigned int max_lws_supported_x{ 64u }; -constexpr unsigned int max_lws_supported_y{ 32u }; -constexpr unsigned int max_lws_supported_z{ 32u }; +constexpr unsigned int max_lws_supported_x{64u}; +constexpr unsigned int max_lws_supported_y{32u}; +constexpr unsigned int max_lws_supported_z{32u}; /** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: /* Shape of 4-D search space */ - TensorShape search_space_shape{ 0, 0, 0, 0 }; - std::vector _lws_x{ 0 }; - std::vector _lws_y{ 0 }; - std::vector _lws_z{ 0 }; - std::vector _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + TensorShape search_space_shape{0, 0, 0, 0}; + std::vector _lws_x{0}; + std::vector _lws_y{0}; + std::vector _lws_z{0}; + std::vector _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units. The value is in the range [-31,+31]. When 0, the runtime-selected wbs used is unmodified. */ @@ -116,7 +116,8 @@ private: * @param[in] lws_max Max LWS value allowed to be tested * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one. 
*/ - void initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); + void + initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); }; /** A minimal subset of LWS values that only have 1,2 and 4/8 */ @@ -170,9 +171,9 @@ CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDR search_space_shape[1] = lws_y_max; search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + _wbsm = {-3, -2, -1, 0, 1, 2, 3}; search_space_shape[3] = _wbsm.size(); } } @@ -194,26 +195,31 @@ CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gw _lws_x = {}; _lws_y = {}; _lws_z = {}; - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_x, gws[0], lws_x_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_y, gws[1], lws_y_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); search_space_shape[0] = _lws_x.size(); search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -2, -1, 0, 1, 2 }; + _wbsm = {-2, -1, 0, 1, 2}; search_space_shape[3] = _wbsm.size(); } } -void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) +void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, + unsigned int gws, + unsigned int lws_max, + bool mod_let_one) { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; ++i) + for (unsigned int i = 2; i <= lws_max; ++i) { // Power of two condition const bool is_power_of_two = (i & (i - 1)) == 0; @@ -221,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(tuning_info.tuner_mode) + switch (tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: return std::make_unique(gws, tuning_info); diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index 45e872428f..9fbdc3a4dd 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/Utility.h" + #include "support/Mutex.h" #include @@ -53,8 +54,7 @@ public: * @param[in] start First value that will be returned by the feeder * @param[in] end End condition (The last value returned by get_next() will be end - 1) */ - explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) - : _atomic_counter(start), _end(end) + explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end) { } /** Return the next element in the range if there is one. 
@@ -89,8 +89,7 @@ void process_workloads(std::vector &workloads, ThreadFeede { ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size()); workloads[workload_index](info); - } - while(feeder.get_next(workload_index)); + } while (feeder.get_next(workload_index)); } /** Set thread affinity. Pin current thread to a particular core @@ -99,7 +98,7 @@ void process_workloads(std::vector &workloads, ThreadFeede */ void set_thread_affinity(int core_id) { - if(core_id < 0) + if (core_id < 0) { return; } @@ -150,10 +149,10 @@ public: */ explicit Thread(int core_pin = -1); - Thread(const Thread &) = delete; + Thread(const Thread &) = delete; Thread &operator=(const Thread &) = delete; Thread(Thread &&) = delete; - Thread &operator=(Thread &&) = delete; + Thread &operator=(Thread &&) = delete; /** Destructor. Make the thread join. */ ~Thread(); @@ -196,21 +195,20 @@ public: private: std::thread _thread{}; ThreadInfo _info{}; - std::vector *_workloads{ nullptr }; - ThreadFeeder *_feeder{ nullptr }; + std::vector *_workloads{nullptr}; + ThreadFeeder *_feeder{nullptr}; std::mutex _m{}; std::condition_variable _cv{}; - bool _wait_for_work{ false }; - bool _job_complete{ true }; - std::exception_ptr _current_exception{ nullptr }; - int _core_pin{ -1 }; - std::list *_thread_pool{ nullptr }; - unsigned int _wake_beg{ 0 }; - unsigned int _wake_end{ 0 }; + bool _wait_for_work{false}; + bool _job_complete{true}; + std::exception_ptr _current_exception{nullptr}; + int _core_pin{-1}; + std::list *_thread_pool{nullptr}; + unsigned int _wake_beg{0}; + unsigned int _wake_end{0}; }; -Thread::Thread(int core_pin) - : _core_pin(core_pin) +Thread::Thread(int core_pin) : _core_pin(core_pin) { _thread = std::thread(&Thread::worker_thread, this); } @@ -218,7 +216,7 @@ Thread::Thread(int core_pin) Thread::~Thread() { // Make sure worker thread has ended - if(_thread.joinable()) + if (_thread.joinable()) { ThreadFeeder feeder; set_workload(nullptr, feeder, ThreadInfo()); @@ -257,7 +255,7 @@ void Thread::worker_thread() { set_thread_affinity(_core_pin); - while(true) + while (true) { std::unique_lock lock(_m); _cv.wait(lock, [&] { return _wait_for_work; }); @@ -266,18 +264,18 @@ void Thread::worker_thread() _current_exception = nullptr; // Exit if the worker thread has not been fed with workloads - if(_workloads == nullptr || _feeder == nullptr) + if (_workloads == nullptr || _feeder == nullptr) { return; } // Wake up more peer threads from thread pool if this job has been delegated to the current thread - if(_thread_pool != nullptr) + if (_thread_pool != nullptr) { auto thread_it = _thread_pool->begin(); std::advance(thread_it, std::min(static_cast(_thread_pool->size()), _wake_beg)); auto wake_end = std::min(_wake_end, static_cast(_info.num_threads - 1)); - for(unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) + for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) { thread_it->start(); } @@ -291,7 +289,7 @@ void Thread::worker_thread() #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(...) + catch (...) 
{ _current_exception = std::current_exception(); } @@ -322,11 +320,11 @@ struct CPPScheduler::Impl final : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U) { const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE")); - if(mode_env_v == "linear") + if (mode_env_v == "linear") { _forced_mode = ModeToggle::Linear; } - else if(mode_env_v == "fanout") + else if (mode_env_v == "fanout") { _forced_mode = ModeToggle::Fanout; } @@ -350,7 +348,7 @@ struct CPPScheduler::Impl final // Set affinity on worked threads _threads.clear(); - for(auto i = 1U; i < _num_threads; ++i) + for (auto i = 1U; i < _num_threads; ++i) { _threads.emplace_back(func(i, thread_hint)); } @@ -359,20 +357,23 @@ struct CPPScheduler::Impl final void auto_switch_mode(unsigned int num_threads_to_use) { // If the environment variable is set to any of the modes, it overwrites the mode selected over num_threads_to_use - if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) + if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8)) { set_fanout_mode(m_default_wake_fanout, num_threads_to_use); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", + this->wake_fanout(), num_threads_to_use); } else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8)) { set_linear_mode(); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", + num_threads_to_use); } } void set_linear_mode() { - for(auto &thread : _threads) + for (auto &thread : _threads) { thread.set_linear_mode(); } @@ -384,14 +385,14 @@ struct CPPScheduler::Impl final ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1); const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1)); auto thread_it = _threads.begin(); - for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) + for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it) { const auto wake_begin = i * actual_wake_fanout - 1; const auto wake_end = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1); thread_it->set_fanout_mode(&_threads, wake_begin, wake_end); } // Reset the remaining threads's wake up schedule - while(thread_it != _threads.end()) + while (thread_it != _threads.end()) { thread_it->set_fanout_mode(&_threads, 0U, 0U); ++thread_it; @@ -417,9 +418,9 @@ struct CPPScheduler::Impl final unsigned int _num_threads; std::list _threads; arm_compute::Mutex _run_workloads_mutex{}; - Mode _mode{ Mode::Linear }; - ModeToggle _forced_mode{ ModeToggle::None }; - unsigned int _wake_fanout{ 0 }; + Mode _mode{Mode::Linear}; + ModeToggle _forced_mode{ModeToggle::None}; + unsigned int _wake_fanout{0}; }; /* @@ -431,8 +432,7 @@ CPPScheduler &CPPScheduler::get() return scheduler; } -CPPScheduler::CPPScheduler() - : _impl(std::make_unique(num_threads_hint())) +CPPScheduler::CPPScheduler() : _impl(std::make_unique(num_threads_hint())) { } @@ -465,15 +465,15 @@ void CPPScheduler::run_workloads(std::vector &workloads) // This 
is not great because different threads workloads won't run in parallel but at least they // won't interfere each other and deadlock. arm_compute::lock_guard lock(_impl->_run_workloads_mutex); - const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast(workloads.size())); - if(num_threads_to_use < 1) + const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast(workloads.size())); + if (num_threads_to_use < 1) { return; } // Re-adjust the mode if the actual number of threads to use is different from the number of threads created _impl->auto_switch_mode(num_threads_to_use); int num_threads_to_start = 0; - switch(_impl->mode()) + switch (_impl->mode()) { case CPPScheduler::Impl::Mode::Fanout: { @@ -494,22 +494,22 @@ void CPPScheduler::run_workloads(std::vector &workloads) unsigned int t = 0; auto thread_it = _impl->_threads.begin(); // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread - for(; t < num_threads_to_use - 1; ++t, ++thread_it) + for (; t < num_threads_to_use - 1; ++t, ++thread_it) { info.thread_id = t; thread_it->set_workload(&workloads, feeder, info); } thread_it = _impl->_threads.begin(); - for(int i = 0; i < num_threads_to_start; ++i, ++thread_it) + for (int i = 0; i < num_threads_to_start; ++i, ++thread_it) { thread_it->start(); } - info.thread_id = t; // Set main thread's thread_id + info.thread_id = t; // Set main thread's thread_id std::exception_ptr last_exception = nullptr; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED try { -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ process_workloads(workloads, feeder, info); // Main thread processes workloads #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } @@ -522,7 +522,7 @@ void CPPScheduler::run_workloads(std::vector &workloads) { #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ thread_it = _impl->_threads.begin(); - for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) + for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) { std::exception_ptr current_exception = thread_it->wait(); if (current_exception) @@ -536,7 +536,7 @@ void CPPScheduler::run_workloads(std::vector &workloads) } #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::system_error &e) + catch (const std::system_error &e) { std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; } diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 5890553f6f..c46a2731d8 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -39,10 +39,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { const Window &max_window = kernel->window(); - if(hints.split_dimension() != IScheduler::split_dimensions_all) + if (hints.split_dimension() != IScheduler::split_dimensions_all) { const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + if (num_iterations < 1) { return; } @@ -53,7 +53,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) kernel->run(kernel->window(), info); } -void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, + const Hints &hints, + const Window &window, + ITensorPack &tensors) { ARM_COMPUTE_UNUSED(hints); ThreadInfo info; @@ 
-65,7 +68,7 @@ void SingleThreadScheduler::run_workloads(std::vector &workloads) { ThreadInfo info; info.cpu_info = &cpu_info(); - for(auto &wl : workloads) + for (auto &wl : workloads) { wl(info); } diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp index dccbe4045d..94a1673d59 100644 --- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp +++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp @@ -42,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + dequantize(*reinterpret_cast(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -80,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm8(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm8_signed(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast(output_it.ptr()) = + quantize_qasymm16(*reinterpret_cast(input_it.ptr()), qinfo); + }, + input_it, output_it); break; default: 
ARM_COMPUTE_ERROR("Unsupported data type"); @@ -132,14 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh { } -void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, - ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out, - ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, + keeps, keeps_size, info); - _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; + _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || + scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; _scores_in = scores_in; _boxes_in = boxes_in; @@ -150,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _batch_splits_out = batch_splits_out; _keeps = keeps; - if(_is_qasymm8) + if (_is_qasymm8) { // Manage intermediate buffers _memory_group.manage(&_scores_in_f32); @@ -160,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _memory_group.manage(&_classes_f32); _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32)); _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { _memory_group.manage(&_batch_splits_in_f32); _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32)); @@ -168,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32)); _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32)); _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _memory_group.manage(&_batch_splits_out_f32); _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32)); } - if(keeps != nullptr) + if (keeps != nullptr) { _memory_group.manage(&_keeps_f32); _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32)); } - _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, + _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, + (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, &_scores_out_f32, &_boxes_out_f32, &_classes_f32, - (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr, - keeps_size, info); + (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, + (keeps != nullptr) ? 
&_keeps_f32 : nullptr, keeps_size, info); } else { - _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, + batch_splits_out, keeps, keeps_size, info); } - if(_is_qasymm8) + if (_is_qasymm8) { _scores_in_f32.allocator()->allocate(); _boxes_in_f32.allocator()->allocate(); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { _batch_splits_in_f32.allocator()->allocate(); } _scores_out_f32.allocator()->allocate(); _boxes_out_f32.allocator()->allocate(); _classes_f32.allocator()->allocate(); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _batch_splits_out_f32.allocator()->allocate(); } - if(keeps != nullptr) + if (keeps != nullptr) { _keeps_f32.allocator()->allocate(); } } } -Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, - const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info) +Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out, + const ITensorInfo *keeps, + const ITensorInfo *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); - const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; - if(is_qasymm8) + const bool is_qasymm8 = + scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out); @@ -237,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_qasymm8) + if (_is_qasymm8) { dequantize_tensor(_scores_in, &_scores_in_f32); dequantize_tensor(_boxes_in, &_boxes_in_f32); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32); } @@ -249,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY); - if(_is_qasymm8) + if (_is_qasymm8) { quantize_tensor(&_scores_out_f32, _scores_out); quantize_tensor(&_boxes_out_f32, _boxes_out); quantize_tensor(&_classes_f32, _classes); - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { quantize_tensor(&_batch_splits_out_f32, _batch_splits_out); } - if(_keeps != nullptr) + if (_keeps != nullptr) { quantize_tensor(&_keeps_f32, _keeps); } diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp 
b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp index 41d875eb97..e6291f973e 100644 --- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include @@ -36,25 +36,35 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status validate_arguments(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N]."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, + "The priorbox input tensor should be [C3, 2, N]."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1"); const int num_priors = input_priorbox->tensor_shape()[0] / 4; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_loc_classes() * 4)) != + input_loc->tensor_shape()[0], + "Number of priors must match number of location predictions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast((num_priors * info.num_classes())) != + input_conf->tensor_shape()[0], + "Number of priors must match number of confidence predictions."); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output); } @@ -65,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input /** Function used to sort pair in descend order based on the score (first) value. 
*/ template -bool SortScorePairDescend(const std::pair &pair1, - const std::pair &pair2) +bool SortScorePairDescend(const std::pair &pair1, const std::pair &pair2) { return pair1.first > pair2.first; } @@ -82,16 +91,19 @@ bool SortScorePairDescend(const std::pair &pair1, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, - const int num_priors, const int num_loc_classes, - const bool share_location, std::vector &all_location_predictions) +void retrieve_all_loc_predictions(const ITensor *input_loc, + const int num, + const int num_priors, + const int num_loc_classes, + const bool share_location, + std::vector &all_location_predictions) { - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { int label = share_location ? -1 : c; - if(all_location_predictions[i].find(label) == all_location_predictions[i].end()) + if (all_location_predictions[i].find(label) == all_location_predictions[i].end()) { all_location_predictions[i][label].resize(num_priors); } @@ -102,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { const int label = share_location ? -1 : c; const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4; //xmin, ymin, xmax, ymax - all_location_predictions[i][label][p][0] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr))); - all_location_predictions[i][label][p][1] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); - all_location_predictions[i][label][p][2] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); - all_location_predictions[i][label][p][3] = *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); + all_location_predictions[i][label][p][0] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr))); + all_location_predictions[i][label][p][1] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); + all_location_predictions[i][label][p][2] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); + all_location_predictions[i][label][p][3] = + *reinterpret_cast(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); } } } @@ -130,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, * @param[out] all_location_predictions All the location predictions. 
* */ -void retrieve_all_conf_scores(const ITensor *input_conf, const int num, - const int num_priors, const int num_classes, +void retrieve_all_conf_scores(const ITensor *input_conf, + const int num, + const int num_priors, + const int num_classes, std::vector>> &all_confidence_scores) { std::vector tmp_buffer; tmp_buffer.resize(num * num_priors * num_classes); - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = - *reinterpret_cast(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); + tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast( + input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { all_confidence_scores[i][c].resize(num_priors); all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors], @@ -168,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_priorbox(const ITensor *input_priorbox, - const int num_priors, - std::vector &all_prior_bboxes, +void retrieve_all_priorbox(const ITensor *input_priorbox, + const int num_priors, + std::vector &all_prior_bboxes, std::vector> &all_prior_variances) { - for(int i = 0; i < num_priors; ++i) + for (int i = 0; i < num_priors; ++i) { - all_prior_bboxes[i] = - { - { - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), - *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3))) - } - }; + all_prior_bboxes[i] = {{*reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), + *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}}; } - std::array var({ { 0, 0, 0, 0 } }); - for(int i = 0; i < num_priors; ++i) + std::array var({{0, 0, 0, 0}}); + for (int i = 0; i < num_priors; ++i) { - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { var[j] = *reinterpret_cast(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j))); } @@ -208,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox, * @param[out] decode_bbox The decoded bboxes. * */ -void DecodeBBox(const BBox &prior_bbox, const std::array &prior_variance, - const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target, - const bool clip_bbox, const BBox &bbox, BBox &decode_bbox) +void DecodeBBox(const BBox &prior_bbox, + const std::array &prior_variance, + const DetectionOutputLayerCodeType code_type, + const bool variance_encoded_in_target, + const bool clip_bbox, + const BBox &bbox, + BBox &decode_bbox) { // if the variance is encoded in target, we simply need to add the offset predictions // otherwise we need to scale the offset accordingly. 
- switch(code_type) + switch (code_type) { case DetectionOutputLayerCodeType::CORNER: { @@ -237,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; - const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; - const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; - const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; - const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; + const float decode_bbox_center_x = + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; + const float decode_bbox_center_y = + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; + const float decode_bbox_width = + (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; + const float decode_bbox_height = + (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f); decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f); @@ -258,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian ARM_COMPUTE_ERROR_ON(prior_width <= 0.f); ARM_COMPUTE_ERROR_ON(prior_height <= 0.f); - decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; - decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; - decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; - decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; + decode_bbox[0] = + prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; + decode_bbox[1] = + prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; + decode_bbox[2] = + prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; + decode_bbox[3] = + prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; break; } @@ -269,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type."); } - if(clip_bbox) + if (clip_bbox) { - for(auto &d_bbox : decode_bbox) + for (auto &d_bbox : decode_bbox) { d_bbox = utility::clamp(d_bbox, 0.f, 1.f); } @@ -289,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array &prior_varian * @param[out] indices The kept indices of bboxes after nms. 
* */ -void ApplyNMSFast(const std::vector &bboxes, - const std::vector &scores, const float score_threshold, - const float nms_threshold, const float eta, const int top_k, - std::vector &indices) +void ApplyNMSFast(const std::vector &bboxes, + const std::vector &scores, + const float score_threshold, + const float nms_threshold, + const float eta, + const int top_k, + std::vector &indices) { ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size."); @@ -300,9 +328,9 @@ void ApplyNMSFast(const std::vector &bboxes, std::list> score_index_vec; // Generate index score pairs. - for(size_t i = 0; i < scores.size(); ++i) + for (size_t i = 0; i < scores.size(); ++i) { - if(scores[i] > score_threshold) + if (scores[i] > score_threshold) { score_index_vec.emplace_back(std::make_pair(scores[i], i)); } @@ -313,7 +341,7 @@ void ApplyNMSFast(const std::vector &bboxes, // Keep top_k scores if needed. const int score_index_vec_size = score_index_vec.size(); - if(top_k > -1 && top_k < score_index_vec_size) + if (top_k > -1 && top_k < score_index_vec_size) { score_index_vec.resize(top_k); } @@ -322,46 +350,45 @@ void ApplyNMSFast(const std::vector &bboxes, float adaptive_threshold = nms_threshold; indices.clear(); - while(!score_index_vec.empty()) + while (!score_index_vec.empty()) { const int idx = score_index_vec.front().second; bool keep = true; - for(int kept_idx : indices) + for (int kept_idx : indices) { - if(keep) + if (keep) { // Compute the jaccard (intersection over union IoU) overlap between two bboxes. - BBox intersect_bbox = std::array({ 0, 0, 0, 0 }); - if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) + BBox intersect_bbox = std::array({0, 0, 0, 0}); + if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || + bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) { - intersect_bbox = std::array({ { 0, 0, 0, 0 } }); + intersect_bbox = std::array({{0, 0, 0, 0}}); } else { - intersect_bbox = std::array({ { - std::max(bboxes[idx][0], bboxes[kept_idx][0]), - std::max(bboxes[idx][1], bboxes[kept_idx][1]), - std::min(bboxes[idx][2], bboxes[kept_idx][2]), - std::min(bboxes[idx][3], bboxes[kept_idx][3]) - } - }); + intersect_bbox = std::array( + {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]), + std::min(bboxes[idx][2], bboxes[kept_idx][2]), + std::min(bboxes[idx][3], bboxes[kept_idx][3])}}); } float intersect_width = intersect_bbox[2] - intersect_bbox[0]; float intersect_height = intersect_bbox[3] - intersect_bbox[1]; float overlap = 0.f; - if(intersect_width > 0 && intersect_height > 0) + if (intersect_width > 0 && intersect_height > 0) { float intersect_size = intersect_width * intersect_height; - float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] - || bboxes[idx][3] < bboxes[idx][1]) ? - 0.f : - (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); - float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0] - || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ? - 0.f : - (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); + float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1]) + ? 
0.f + : (bboxes[idx][2] - bboxes[idx][0]) * + (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); + float bbox2_size = + (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1]) + ? 0.f + : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * + (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size); } keep = (overlap <= adaptive_threshold); @@ -371,12 +398,12 @@ void ApplyNMSFast(const std::vector &bboxes, break; } } - if(keep) + if (keep) { indices.push_back(idx); } score_index_vec.erase(score_index_vec.begin()); - if(keep && eta < 1.f && adaptive_threshold > 0.5f) + if (keep && eta < 1.f && adaptive_threshold > 0.5f) { adaptive_threshold *= eta; } @@ -385,13 +412,27 @@ void ApplyNMSFast(const std::vector &bboxes, } // namespace CPPDetectionOutputLayer::CPPDetectionOutputLayer() - : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(), - _all_prior_variances(), _all_decode_bboxes(), _all_indices() + : _input_loc(nullptr), + _input_conf(nullptr), + _input_priorbox(nullptr), + _output(nullptr), + _info(), + _num_priors(), + _num(), + _all_location_predictions(), + _all_confidence_scores(), + _all_prior_bboxes(), + _all_prior_variances(), + _all_decode_bboxes(), + _all_indices() { } -void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, - ITensor *output, DetectionOutputLayerInfo info) +void CPPDetectionOutputLayer::configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info); @@ -400,11 +441,13 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor // Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum // The maximum is keep_top_k * input_loc_size[1] // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax] - const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); _input_loc = input_loc; _input_conf = input_conf; @@ -420,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor _all_prior_variances.resize(_num_priors); _all_decode_bboxes.resize(_num); - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? 
-1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; @@ -440,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); } -Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info)); return Status{}; @@ -449,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe void CPPDetectionOutputLayer::run() { // Retrieve all location predictions. - retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions); + retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), + _all_location_predictions); // Retrieve all confidences. retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores); @@ -459,75 +507,79 @@ void CPPDetectionOutputLayer::run() // Decode all loc predictions to bboxes const bool clip_bbox = false; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; } - ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label); + ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), + "Could not find location predictions for label %d.", label); const std::vector &label_loc_preds = _all_location_predictions[i].find(label)->second; const int num_bboxes = _all_prior_bboxes.size(); ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4); - for(int j = 0; j < num_bboxes; ++j) + for (int j = 0; j < num_bboxes; ++j) { - DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]); + DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), + _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], + _all_decode_bboxes[i][label][j]); } } } int num_kept = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - const std::map> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + const std::map> &conf_scores = _all_confidence_scores[i]; std::map> indices; - int num_det = 0; - for(int c = 0; c < _info.num_classes(); ++c) + int num_det = 0; + for (int c = 0; c < _info.num_classes(); ++c) { - if(c == _info.background_label_id()) + if (c == _info.background_label_id()) { // Ignore background class continue; } const int label = _info.share_location() ? 
-1 : c; - if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) + if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector &scores = conf_scores.find(c)->second; - const std::vector &bboxes = decode_bboxes.find(label)->second; + const std::vector &bboxes = decode_bboxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]); + ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), + _info.top_k(), indices[c]); num_det += indices[c].size(); } int num_to_add = 0; - if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) + if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) { std::vector>> score_index_pairs; - for(auto const &it : indices) + for (auto const &it : indices) { const int label = it.first; const std::vector &label_indices = it.second; - if(conf_scores.find(label) == conf_scores.end()) + if (conf_scores.find(label) == conf_scores.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector &scores = conf_scores.find(label)->second; - for(auto idx : label_indices) + for (auto idx : label_indices) { ARM_COMPUTE_ERROR_ON(idx > static_cast(scores.size())); score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx))); @@ -541,7 +593,7 @@ void CPPDetectionOutputLayer::run() // Store the new indices. std::map> new_indices; - for(auto score_index_pair : score_index_pairs) + for (auto score_index_pair : score_index_pairs) { int label = score_index_pair.second.first; int idx = score_index_pair.second.second; @@ -562,25 +614,25 @@ void CPPDetectionOutputLayer::run() _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept))); int count = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const std::map> &conf_scores = _all_confidence_scores[i]; - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - for(auto &it : _all_indices[i]) + const std::map> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + for (auto &it : _all_indices[i]) { const int label = it.first; const std::vector &scores = conf_scores.find(label)->second; const int loc_label = _info.share_location() ? -1 : label; - if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) + if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) { // Either if there are no confidence predictions // or there are no location predictions for current label. 
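The reflowed ApplyNMSFast hunk above is formatting-only; as a reading aid, here is a condensed sketch of the same greedy scheme (hypothetical helper names, again assuming BBox = std::array<float, 4> in [xmin, ymin, xmax, ymax] order), including the adaptive threshold that decays by eta while it stays above 0.5:

#include <algorithm>
#include <array>
#include <functional>
#include <utility>
#include <vector>

using BBox = std::array<float, 4>; // assumed order: [xmin, ymin, xmax, ymax]

static float iou(const BBox &a, const BBox &b)
{
    const float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]);
    const float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]);
    if (iw <= 0.f || ih <= 0.f)
    {
        return 0.f; // no overlap
    }
    const float inter  = iw * ih;
    const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
    const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (area_a + area_b - inter);
}

// Greedy NMS: keep the highest-scoring boxes that do not overlap an already-kept
// box by more than the (adaptively decaying) threshold.
std::vector<int> nms_fast(const std::vector<BBox> &bboxes, const std::vector<float> &scores,
                          float score_threshold, float nms_threshold, float eta, int top_k)
{
    std::vector<std::pair<float, int>> candidates;
    for (size_t i = 0; i < scores.size(); ++i)
    {
        if (scores[i] > score_threshold)
        {
            candidates.emplace_back(scores[i], static_cast<int>(i));
        }
    }
    std::sort(candidates.begin(), candidates.end(), std::greater<>()); // highest score first
    if (top_k > -1 && top_k < static_cast<int>(candidates.size()))
    {
        candidates.resize(top_k); // keep only the top_k scores, as in the hunk above
    }

    std::vector<int> kept;
    float            adaptive_threshold = nms_threshold;
    for (const auto &c : candidates)
    {
        const int idx  = c.second;
        bool      keep = true;
        for (int kept_idx : kept)
        {
            if (iou(bboxes[kept_idx], bboxes[idx]) > adaptive_threshold)
            {
                keep = false;
                break;
            }
        }
        if (keep)
        {
            kept.push_back(idx);
            if (eta < 1.f && adaptive_threshold > 0.5f)
            {
                adaptive_threshold *= eta;
            }
        }
    }
    return kept;
}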
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label); } const std::vector &bboxes = decode_bboxes.find(loc_label)->second; - const std::vector &indices = it.second; + const std::vector &indices = it.second; - for(auto idx : indices) + for (auto idx : indices) { *(reinterpret_cast(_output->ptr_to_element(Coordinates(count * 7)))) = i; *(reinterpret_cast(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label; diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index ecbc49b3c1..2861d6cacb 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include #include @@ -38,53 +38,76 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, - DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox) +Status validate_arguments(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info, + const unsigned int kBatchSize, + const unsigned int kNumCoordBox) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize]."); - if(input_box_encoding->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, + "The location input tensor shape should be [4, N, kBatchSize]."); + if (input_box_encoding->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR( + input_box_encoding->dimension(2) != kBatchSize, + "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1), - "The first dimension of the input class_prediction should be equal to the number of classes plus one."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize]."); - 
if(input_anchors->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, + "The first dimension of the input box_encoding tensor should be equal to %d.", + kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_class_score->dimension(0) != (info.num_classes() + 1), + "The first dimension of the input class_prediction should be equal to the number of classes plus one."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, + "The anchors input tensor shape should be [4, N, kBatchSize]."); + if (input_anchors->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, + "The first dimension of the input anchors tensor should be equal to %d.", + kNumCoordBox); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) - || (input_box_encoding->dimension(1) != input_anchors->dimension(1)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) || + (input_box_encoding->dimension(1) != input_anchors->dimension(1)), "The second dimension of the inputs should be the same."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, + "The num_detection output tensor shape should be [M]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), + "The intersection over union should be positive and less than 1."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, + "The number of max classes per detection should be positive."); const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection(); // Validate configured outputs - if(output_boxes->total_size() != 0) + if (output_boxes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), + TensorShape(4U, num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32); } - if(output_classes->total_size() != 0) + if (output_classes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32); } - if(output_scores->total_size() != 0) + if (output_scores->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); 
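The validate_arguments hunk above only re-wraps existing checks; the shape relationships it enforces can be summarised with a small illustrative helper (hypothetical, not part of the library), assuming a single batch (kBatchSize = 1) and four coordinates per box (kNumCoordBox = 4):

#include <cstddef>

// Shape bookkeeping mirrored from the checks above.
struct PostProcessShapes
{
    std::size_t boxes[3];   // output_boxes:   [4, max_detections * max_classes_per_detection, 1]
    std::size_t classes[2]; // output_classes: [max_detections * max_classes_per_detection, 1]
    std::size_t scores[2];  // output_scores:  [max_detections * max_classes_per_detection, 1]
    std::size_t num_det[1]; // num_detection:  [1]
};

PostProcessShapes expected_shapes(std::size_t max_detections, std::size_t max_classes_per_detection)
{
    const std::size_t num_detected_boxes = max_detections * max_classes_per_detection;
    return PostProcessShapes{{4, num_detected_boxes, 1}, {num_detected_boxes, 1}, {num_detected_boxes, 1}, {1}};
}

// e.g. max_detections = 10, max_classes_per_detection = 1 gives output_boxes of shape [4, 10, 1].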
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32); } - if(num_detection->total_size() != 0) + if (num_detection->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32); @@ -93,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn return Status{}; } -inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info) +inline void +DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info) { const float half_factor = 0.5f; // BBox is equavalent to CenterSizeEncoding [y,x,h,w] const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0]; const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1]; - const float half_h = half_factor * static_cast(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2]; - const float half_w = half_factor * static_cast(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3]; + const float half_h = + half_factor * static_cast(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2]; + const float half_w = + half_factor * static_cast(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3]; // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax] auto decoded_ptr = reinterpret_cast(decoded_it.ptr()); @@ -118,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode * @param[in] info The detection informations * @param[out] decoded_boxes The decoded bboxes. */ -void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes) +void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, + const ITensor *input_anchors, + DetectionPostProcessLayerInfo info, + Tensor *decoded_boxes) { const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info(); const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info(); - BBox box_centersize{ {} }; - BBox anchor{ {} }; + BBox box_centersize{{}}; + BBox anchor{{}}; Window win; win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape()); @@ -133,107 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp Iterator anchor_it(input_anchors, win); Iterator decoded_it(decoded_boxes, win); - if(input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (input_box_encoding->info()->data_type() == DataType::QASYMM8) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), - dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = 
reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = + BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), + dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors), + dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } - else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), - dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box), + dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), + dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), + dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), + dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); - box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) }); - anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast(anchor_it.ptr()); + box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)}); + anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } } -void SaveOutputs(const Tensor *decoded_boxes, const std::vector &result_idx_boxes_after_nms, const std::vector &result_scores_after_nms, const std::vector &result_classes_after_nms, - std::vector &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor 
*output_scores, - ITensor *num_detection) +void SaveOutputs(const Tensor *decoded_boxes, + const std::vector &result_idx_boxes_after_nms, + const std::vector &result_scores_after_nms, + const std::vector &result_classes_after_nms, + std::vector &sorted_indices, + const unsigned int num_output, + const unsigned int max_detections, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection) { // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax unsigned int i = 0; - for(; i < num_output; ++i) + for (; i < num_output; ++i) { const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]]; - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); - *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); - *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = static_cast(result_classes_after_nms[sorted_indices[i]]); - *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]]; + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); + *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = + *(reinterpret_cast(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); + *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = + static_cast(result_classes_after_nms[sorted_indices[i]]); + *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = + result_scores_after_nms[sorted_indices[i]]; } - for(; i < max_detections; ++i) + for (; i < max_detections; ++i) { *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f; *(reinterpret_cast(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f; - *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; - *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; } *(reinterpret_cast(num_detection->ptr_to_element(Coordinates(0)))) = num_output; } } // namespace CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), - _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), 
_num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(), - _selected_indices(), _class_scores(), _input_scores_to_use(nullptr) + : _memory_group(std::move(memory_manager)), + _nms(), + _input_box_encoding(nullptr), + _input_scores(nullptr), + _input_anchors(nullptr), + _output_boxes(nullptr), + _output_classes(nullptr), + _output_scores(nullptr), + _num_detection(nullptr), + _info(), + _num_boxes(), + _num_classes_with_background(), + _num_max_detected_boxes(), + _dequantize_scores(false), + _decoded_boxes(), + _decoded_scores(), + _selected_indices(), + _class_scores(), + _input_scores_to_use(nullptr) { } -void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, - const ITensor *input_anchors, ITensor *output_boxes, ITensor *output_classes, - ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection(); - auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_boxes->info(), + TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_classes->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_scores->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(), - num_detection->info(), - info, _kBatchSize, _kNumCoordBox)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox)); _input_box_encoding = input_box_encoding; _input_scores = input_scores; @@ -245,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _info = info; _num_boxes = input_box_encoding->info()->dimension(1); _num_classes_with_background = _input_scores->info()->dimension(0); - _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); - - 
auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); + _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); + + auto_init_if_empty(*_decoded_boxes.info(), + TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, + DataType::F32)); + auto_init_if_empty( + *_decoded_scores.info(), + TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), + 1, DataType::F32)); + auto_init_if_empty( + *_selected_indices.info(), + TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, + DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); - auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); + auto_init_if_empty( + *_class_scores.info(), + TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, + DataType::F32)); _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores; @@ -260,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _memory_group.manage(&_decoded_scores); _memory_group.manage(&_selected_indices); _memory_group.manage(&_class_scores); - _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold()); + _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, + info.use_regular_nms() ? 
info.detection_per_class() : info.max_detections(), + info.nms_score_threshold(), info.iou_threshold()); // Allocate and reserve intermediate tensors and vectors _decoded_boxes.allocator()->allocate(); @@ -269,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _class_scores.allocator()->allocate(); } -Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { - constexpr unsigned int kBatchSize = 1; - constexpr unsigned int kNumCoordBox = 4; - const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); - - ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(), - info.iou_threshold())); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox)); + constexpr unsigned int kBatchSize = 1; + constexpr unsigned int kNumCoordBox = 4; + const TensorInfo _decoded_boxes_info = + TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, + &_selected_indices_info, info.max_detections(), + info.nms_score_threshold(), info.iou_threshold())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info, kBatchSize, + kNumCoordBox)); return Status{}; } @@ -293,62 +393,69 @@ void CPPDetectionPostProcessLayer::run() DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes); // Decode scores if necessary - if(_dequantize_scores) + if (_dequantize_scores) { - if(_input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (_input_box_encoding->info()->data_type() == DataType::QASYMM8) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8(*(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), 
_input_scores->info()->quantization_info()); + dequantize_qasymm8( + *(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } - else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8_signed(*(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8_signed(*(reinterpret_cast( + _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } } // Regular NMS - if(_info.use_regular_nms()) + if (_info.use_regular_nms()) { std::vector result_idx_boxes_after_nms; std::vector result_classes_after_nms; std::vector result_scores_after_nms; std::vector sorted_indices; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(i)))) = - *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 + *(reinterpret_cast(_input_scores_to_use->ptr_to_element( + Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } // Run Non-maxima Suppression _nms.run(); - for(unsigned int i = 0; i < _info.detection_per_class(); ++i) + for (unsigned int i = 0; i < _info.detection_per_class(); ++i) { - const auto selected_index = *(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))); - if(selected_index == -1) + const auto selected_index = + *(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))); + if (selected_index == -1) { // Nms will return -1 for all the last M-elements not valid break; } result_idx_boxes_after_nms.emplace_back(selected_index); - result_scores_after_nms.emplace_back((reinterpret_cast(_class_scores.buffer()))[selected_index]); + result_scores_after_nms.emplace_back( + (reinterpret_cast(_class_scores.buffer()))[selected_index]); result_classes_after_nms.emplace_back(c); } } @@ -360,49 +467,46 @@ void CPPDetectionPostProcessLayer::run() // Sort selected indices based on result scores sorted_indices.resize(num_selected); std::iota(sorted_indices.begin(), sorted_indices.end(), 0); - std::partial_sort(sorted_indices.data(), - sorted_indices.data() + num_output, + std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output, sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) - { - - return result_scores_after_nms[first] > result_scores_after_nms[second]; - }); + { return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, 
result_scores_after_nms, result_classes_after_nms, + sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, + _num_detection); } // Fast NMS else { - const unsigned int num_classes_per_box = std::min(_info.max_classes_per_detection(), _info.num_classes()); + const unsigned int num_classes_per_box = + std::min(_info.max_classes_per_detection(), _info.num_classes()); std::vector max_scores; std::vector box_indices; std::vector max_score_classes; - for(unsigned int b = 0; b < _num_boxes; ++b) + for (unsigned int b = 0; b < _num_boxes; ++b) { std::vector box_scores; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { - box_scores.emplace_back(*(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); + box_scores.emplace_back( + *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } std::vector max_score_indices; max_score_indices.resize(_info.num_classes()); std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); - std::partial_sort(max_score_indices.data(), - max_score_indices.data() + num_classes_per_box, + std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box, max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) - { - return box_scores[first] > box_scores[second]; - }); + { return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i) + for (unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = box_scores[max_score_indices[i]]; - *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = + score_to_add; max_scores.emplace_back(score_to_add); box_indices.emplace_back(b); max_score_classes.emplace_back(max_score_indices[i]); @@ -412,10 +516,10 @@ void CPPDetectionPostProcessLayer::run() // Run Non-maxima Suppression _nms.run(); std::vector selected_indices; - for(unsigned int i = 0; i < max_detections; ++i) + for (unsigned int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 - if(*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) + if (*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) { // Nms will return -1 for all the last M-elements not valid break; @@ -425,8 +529,8 @@ void CPPDetectionPostProcessLayer::run() // We select the max detection numbers of the highest score of all classes const auto num_output = std::min(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output, + max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp index 6d01b127c0..3217742c6b 100644 --- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp +++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp @@ -29,9 +29,12 @@ namespace 
arm_compute { -void CPPNonMaximumSuppression::configure( - const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +void CPPNonMaximumSuppression::configure(const ITensor *bboxes, + const ITensor *scores, + ITensor *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); @@ -40,10 +43,14 @@ void CPPNonMaximumSuppression::configure( _kernel = std::move(k); } -Status CPPNonMaximumSuppression::validate( - const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, + nms_threshold); } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp index 62a74735a2..3d64def804 100644 --- a/src/runtime/CPP/functions/CPPTopKV.cpp +++ b/src/runtime/CPP/functions/CPPTopKV.cpp @@ -38,7 +38,10 @@ void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITe _kernel = std::move(kernel); } -Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKV::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { return CPPTopKVKernel::validate(predictions, targets, output, k); } diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index 436fd9ca16..ecf84abd2c 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuInfo.h" #include "src/runtime/SchedulerUtils.h" @@ -59,7 +60,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); #ifndef BARE_METAL const Window &max_window = window; - if(hints.split_dimension() == IScheduler::split_dimensions_all) + if (hints.split_dimension() == IScheduler::split_dimensions_all) { /* * if the split dim is size_t max then this signals we should parallelise over @@ -73,27 +74,27 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n); std::vector workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) + for (unsigned int ni = 0; ni != n_threads; ++ni) { - for(unsigned int mi = 0; mi != m_threads; ++mi) + for (unsigned int mi = 0; mi != m_threads; ++mi) { workloads.push_back( - [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); + [ni, mi, m_threads, n_threads, 
&max_window, &kernel](const ThreadInfo &info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); - win.validate(); + win.validate(); - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - thread_locator.validate(); + thread_locator.validate(); - kernel->run_nd(win, info, thread_locator); - }); + kernel->run_nd(win, info, thread_locator); + }); } } run_workloads(workloads); @@ -103,16 +104,16 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, this->num_threads()); - if(num_iterations == 0) + if (num_iterations == 0) { return; } - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; info.cpu_info = &cpu_info(); - if(tensors.empty()) + if (tensors.empty()) { kernel->run(max_window, info); } @@ -124,14 +125,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W else { unsigned int num_windows = 0; - switch(hints.strategy()) + switch (hints.strategy()) { case StrategyHint::STATIC: num_windows = num_threads; break; case StrategyHint::DYNAMIC: { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold()); + const unsigned int granule_threshold = + (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold()); // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations; break; @@ -143,15 +145,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info()); std::vector workloads(num_windows); - for(unsigned int t = 0; t < num_windows; ++t) + for (unsigned int t = 0; t < num_windows; ++t) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); - if(tensors.empty()) + if (tensors.empty()) { kernel->run(win, info); } @@ -175,36 +177,43 @@ void IScheduler::run_tagged_workloads(std::vector &workloads, const ch run_workloads(workloads); } -std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info) +std::size_t IScheduler::adjust_num_of_windows(const Window &window, + std::size_t split_dimension, + std::size_t init_num_windows, + const ICPPKernel &kernel, + const CPUInfo &cpu_info) { // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow"). 
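The IScheduler hunk above is likewise formatting-only; as a reading aid, here is a minimal sketch of the window-count decision it reindents (hypothetical helper, simplified to the two strategy hints shown), assuming the thread count has already been capped by the number of iterations:

#include <algorithm>
#include <cstddef>

enum class StrategyHint
{
    STATIC,
    DYNAMIC
};

// STATIC uses one window per usable thread; DYNAMIC caps the window count at a
// granule threshold so the sub-windows do not become too small and contend on the feeder.
std::size_t pick_num_windows(StrategyHint strategy, std::size_t num_iterations, std::size_t num_threads, int threshold)
{
    const std::size_t usable_threads = std::min(num_iterations, num_threads);
    switch (strategy)
    {
        case StrategyHint::STATIC:
            return usable_threads;
        case StrategyHint::DYNAMIC:
        {
            const std::size_t granule_threshold =
                (threshold <= 0) ? usable_threads : static_cast<std::size_t>(threshold);
            return std::min(num_iterations, granule_threshold);
        }
    }
    return usable_threads;
}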
- if(window.num_iterations(split_dimension) < init_num_windows) + if (window.num_iterations(split_dimension) < init_num_windows) { auto recommended_split_dim = Window::DimX; - for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims) + for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims) { - if(window.num_iterations(recommended_split_dim) < window.num_iterations(dims)) + if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims)) { recommended_split_dim = dims; } } - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", split_dimension, - recommended_split_dim); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", + split_dimension, recommended_split_dim); } - for(auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first + for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first { // Try splitting the workload into t, subject to each subworkload size <= mws. - if((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) + if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) { - if(t != init_num_windows) + if (t != init_num_windows) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using a different thread count than the one assigned by the user."); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using a different thread count than the one assigned by the user."); } return t; } } - ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using single thread instead of the thread count assigned by the user."); + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using single thread instead of the thread count assigned by the user."); return 1; // If the workload is so small that it can't be split, we should run a single thread } diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp index a6bc950644..8e5b62ae7d 100644 --- a/src/runtime/ISimpleLifetimeManager.cpp +++ b/src/runtime/ISimpleLifetimeManager.cpp @@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager() void ISimpleLifetimeManager::register_group(IMemoryGroup *group) { - if(_active_group == nullptr) + if (_active_group == nullptr) { ARM_COMPUTE_ERROR_ON(group == nullptr); _active_group = group; @@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group) bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) { - if(group == nullptr) + if (group == nullptr) { return false; } const bool status = bool(_finalized_groups.erase(group)); - if(status) + if (status) { group->mappings().clear(); } @@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) void ISimpleLifetimeManager::start_lifetime(void *obj) { ARM_COMPUTE_ERROR_ON(obj == nullptr); - ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!"); + ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), + "Memory object is already registered!"); // Check if there is a free blob - if(_free_blobs.empty()) + if (_free_blobs.empty()) { - _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } }); + _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}}); } else { @@ -100,10 +101,8 @@ void 
ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t el.status = true; // Find object in the occupied lists - auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b) - { - return obj == b.id; - }); + auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), + [&obj](const Blob &b) { return obj == b.id; }); ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs)); // Update occupied blob and return as free @@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it); // Check if all objects are finalized and reset active group - if(are_all_finalized()) + if (are_all_finalized()) { ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty()); @@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t bool ISimpleLifetimeManager::are_all_finalized() const { - return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair &e) - { - return !e.second.status; - }); + return !std::any_of(std::begin(_active_elements), std::end(_active_elements), + [](const std::pair &e) { return !e.second.status; }); } } // namespace arm_compute diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp index 373c50c73d..96287dcc49 100644 --- a/src/runtime/IWeightsManager.cpp +++ b/src/runtime/IWeightsManager.cpp @@ -25,14 +25,13 @@ namespace arm_compute { -IWeightsManager::IWeightsManager() - : _managed_weights(), _managed_counter(), _managed_weights_parents() +IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents() { } void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) { - if(!are_weights_managed(weights)) + if (!are_weights_managed(weights)) { _managed_weights[weights]; _managed_counter[weights]; @@ -44,9 +43,9 @@ void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) // In case the weights are an output of a previous reshape function // store the parent's link - if(parent != nullptr) + if (parent != nullptr) { - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { _managed_weights_parents[weights] = parent; } @@ -59,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Find if I have the same weights with weights transform. 
If I do, don't run the reshape auto item = _managed_weights.find(weights); - bool perform_run{ true }; - ITensor *weights_tensor{ nullptr }; + bool perform_run{true}; + ITensor *weights_tensor{nullptr}; // Check if I already have the requested transform and I have run the reshape function - for(auto it : item->second) + for (auto it : item->second) { - if(it->is_reshape_run() && (it->uid() == weights_transform->uid())) + if (it->is_reshape_run() && (it->uid() == weights_transform->uid())) { weights_tensor = it->get_weights(); perform_run = false; @@ -73,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights } } - if(perform_run) + if (perform_run) { weights_transform->run(); weights_tensor = weights_transform->get_weights(); @@ -81,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check if we can release memory from parent auto parent_item = _managed_weights_parents.find(weights); - if(parent_item != _managed_weights_parents.end()) + if (parent_item != _managed_weights_parents.end()) { int32_t refcount = parent_item->second->decrease_refcount(); - if(refcount == 0) + if (refcount == 0) { parent_item->second->release(); } @@ -92,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check top level weights. If all the transformations are done // mark the weights as unused - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { auto item = _managed_weights.find(weights); bool mark_as_unused = true; - for(auto it : item->second) + for (auto it : item->second) { - if(!it->is_reshape_run()) + if (!it->is_reshape_run()) { mark_as_unused = false; break; } } - if(mark_as_unused) + if (mark_as_unused) { weights->mark_as_unused(); } @@ -123,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei { ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed"); - ITensor *transformed_weights{ nullptr }; + ITensor *transformed_weights{nullptr}; auto item = _managed_weights.find(weights); // Check if I already have the requested transform. 
If I do, // increase the refcount of the transformed weights object and // reuse the tensor - for(auto it : item->second) + for (auto it : item->second) { - if(it->uid() == weights_transform->uid()) + if (it->uid() == weights_transform->uid()) { transformed_weights = it->get_weights(); it->increase_refcount(); @@ -139,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei } } - if(transformed_weights == nullptr) + if (transformed_weights == nullptr) { transformed_weights = weights_transform->get_weights(); weights_transform->increase_refcount(); @@ -154,13 +153,13 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei void IWeightsManager::release(const ITensor *weights) { - if(weights == nullptr || !are_weights_managed(weights)) + if (weights == nullptr || !are_weights_managed(weights)) { return; } _managed_counter[weights].counter--; - if(_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) + if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) { weights->mark_as_unused(); } @@ -168,7 +167,7 @@ void IWeightsManager::release(const ITensor *weights) void IWeightsManager::pre_mark_as_unused(const ITensor *weights) { - if(weights == nullptr || !are_weights_managed(weights)) + if (weights == nullptr || !are_weights_managed(weights)) { return; } diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp index ac0a32539e..90fd025eb7 100644 --- a/src/runtime/Memory.cpp +++ b/src/runtime/Memory.cpp @@ -27,20 +27,17 @@ namespace arm_compute { -Memory::Memory() - : _region(nullptr), _region_owned(nullptr) +Memory::Memory() : _region(nullptr), _region_owned(nullptr) { } -Memory::Memory(const std::shared_ptr &memory) - : _region(nullptr), _region_owned(memory) +Memory::Memory(const std::shared_ptr &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -Memory::Memory(IMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index 2e418ae9e3..5fa9ea47e9 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -31,7 +31,8 @@ namespace arm_compute { -MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr lifetime_manager, std::shared_ptr pool_manager) +MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr lifetime_manager, + std::shared_ptr pool_manager) : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)) { ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); @@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t // Create pools auto pool_template = _lifetime_mgr->create_pool(&allocator); - for(int i = num_pools; i > 1; --i) + for (int i = num_pools; i > 1; --i) { auto pool = pool_template->duplicate(); _pool_mgr->register_pool(std::move(pool)); diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp index a5fc0a2726..fcfd3251ff 100644 --- a/src/runtime/NEON/INEOperator.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/INEOperator.h" + #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -32,14 +34,13 @@ namespace experimental { INEOperator::~INEOperator() = default; -INEOperator::INEOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void INEOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index 5438bce62a..b6977221b9 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute @@ -33,8 +34,7 @@ namespace arm_compute INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT - : _kernel(), - _border_handler() + : _kernel(), _border_handler() { } diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 21dd58e378..04bff9fa4b 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/INEKernel.h" #include "src/runtime/Utils.h" @@ -32,9 +33,7 @@ namespace arm_compute { INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; -INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) - : _kernel(), - _ctx(ctx) +INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) { } diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index e48aede590..59199452ce 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -24,24 +24,24 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuActivation.h" namespace arm_compute { struct NEActivationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - IRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) - : _impl(std::make_unique()) +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; NEActivationLayer::~NEActivationLayer() = default; @@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); } -Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status 
+NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return cpu::CpuActivation::validate(input, output, act_info); } diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp index cfeaefc4fd..a72364791c 100644 --- a/src/runtime/NEON/functions/NEAddMulAdd.cpp +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuAddMulAdd.h" @@ -33,45 +34,50 @@ namespace arm_compute { struct NEAddMulAdd::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; MemoryGroup memory_group{}; }; -NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } NEAddMulAdd::~NEAddMulAdd() = default; -void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, ITensor *add_output, - ITensor *final_output, const ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); - _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), - bn_add->info(), add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); + _impl->op = std::make_unique(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? 
add_output->info() : nullptr, final_output->info(), policy, act_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input1 }, - { TensorType::ACL_SRC_1, input2 }, - { TensorType::ACL_SRC_2, bn_mul }, - { TensorType::ACL_SRC_3, bn_add }, - { TensorType::ACL_DST_0, add_output }, - { TensorType::ACL_DST_1, final_output }, + _impl->run_pack = { + {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul}, + {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output}, }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *bn_mul, - const ITensorInfo *bn_add, const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); } diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 3ac127b02e..fbaf1a96e7 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -32,6 +32,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -48,8 +49,7 @@ struct NEArgMinMaxLayer::Impl NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; -NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } @@ -58,7 +58,8 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons { ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _impl->reduction_function = std::make_unique(); - if(output->info() && (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + if (output->info() && + (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->cast_function = std::make_unique(); @@ -74,9 +75,11 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons } } -Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid operation"); return NEReductionOperation::validate(input, output, axis, op, false); } @@ -84,7 +87,7 @@ void NEArgMinMaxLayer::run() { MemoryGroupResourceScope 
scope_mg(_impl->memory_group); _impl->reduction_function->run(); - if(_impl->tmp_reduction_result != nullptr) + if (_impl->tmp_reduction_result != nullptr) { _impl->cast_function->run(); } diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index a7581ca9f4..aff16ae9d1 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuAdd.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticAddition::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticAddition::NEArithmeticAddition() - : _impl(std::make_unique()) +NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique()) { } -NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; NEArithmeticAddition::~NEArithmeticAddition() = default; -Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAdd::validate(input1, input2, output, policy, act_info); } -void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticAddition::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 6fdd4267bf..097525c1a8 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuSub.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticSubtraction::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticSubtraction::NEArithmeticSubtraction() - : _impl(std::make_unique()) +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique()) { } -NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; 
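Note on the pattern being reformatted in the NEActivationLayer / NEArithmeticAddition / NEArithmeticSubtraction hunks above: each runtime function keeps a small Impl struct holding raw src/dst pointers plus a unique_ptr to a backend operator, configures that operator once on tensor metadata, and later forwards run() to it. The following is a minimal standalone sketch of that structure only; Tensor and AddOperator are hypothetical stand-ins, not the library's ITensor / cpu::CpuAdd types.

#include <memory>

// Hypothetical stand-ins: only the shape of the pimpl-plus-operator pattern is shown.
struct Tensor
{
    // buffer + metadata would live here
};

struct AddOperator
{
    void configure(const Tensor *a, const Tensor *b, Tensor *out)
    {
        // In the library this step validates metadata and selects kernels.
        (void)a; (void)b; (void)out;
    }
    void run(const Tensor *a, const Tensor *b, Tensor *out)
    {
        // In the library this dispatches the selected kernels over a tensor pack.
        (void)a; (void)b; (void)out;
    }
};

class ArithmeticAddition
{
public:
    ArithmeticAddition() : _impl(std::make_unique<Impl>()) {}

    void configure(const Tensor *a, const Tensor *b, Tensor *out)
    {
        _impl->src_0 = a;
        _impl->src_1 = b;
        _impl->dst   = out;
        _impl->op    = std::make_unique<AddOperator>();
        _impl->op->configure(a, b, out); // configure once ...
    }

    void run()
    {
        _impl->op->run(_impl->src_0, _impl->src_1, _impl->dst); // ... run many times
    }

private:
    struct Impl
    {
        const Tensor                *src_0{nullptr};
        const Tensor                *src_1{nullptr};
        Tensor                      *dst{nullptr};
        std::unique_ptr<AddOperator> op{nullptr};
    };
    std::unique_ptr<Impl> _impl;
};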
-Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuSub::validate(input1, input2, output, policy, act_info); } -void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index db49f4c1a0..d491f0aafc 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" @@ -36,12 +37,17 @@ namespace arm_compute { NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); @@ -50,10 +56,17 @@ void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index e258028d05..5d711c5ddf 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include 
"src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" @@ -41,19 +42,25 @@ void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_s _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { auto k = std::make_unique(); k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 90eb72706e..89ce2087be 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 69e5288b88..eda59cd3e9 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 0b19e919ee..3d6f30b0fe 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index cc9df9f1c4..f0cf3d3e5c 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index af00171be6..adf891e417 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ 
-22,12 +22,16 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel @@ -36,7 +40,10 @@ void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index f93a6ea745..1fd172a730 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuCast.h" @@ -31,16 +32,15 @@ namespace arm_compute { struct NECast::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECast::NECast() - : _impl(std::make_unique()) +NECast::NECast() : _impl(std::make_unique()) { } -NECast::NECast(NECast &&) = default; +NECast::NECast(NECast &&) = default; NECast &NECast::operator=(NECast &&) = default; NECast::~NECast() = default; @@ -62,7 +62,7 @@ Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void NECast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 8b96fadb74..86bee4dd43 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index ceb697aad6..59a0892f1f 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,33 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "src/cpu/operators/CpuConcatenate.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include 
"arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { struct NEConcatenateLayer::Impl { std::vector srcs{}; - ITensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -NEConcatenateLayer::NEConcatenateLayer() - : _impl(std::make_unique()) +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique()) { } -NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; NEConcatenateLayer::~NEConcatenateLayer() = default; @@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } @@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector &inpu void NEConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp index 3bb66c44b0..8f41151d6c 100644 --- a/src/runtime/NEON/functions/NEConv3D.cpp +++ b/src/runtime/NEON/functions/NEConv3D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv3d.h" @@ -35,35 +36,41 @@ using namespace arm_compute::experimental; struct NEConv3D::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -NEConv3D::NEConv3D() - : _impl(std::make_unique()) +NEConv3D::NEConv3D() : _impl(std::make_unique()) { } NEConv3D::~NEConv3D() = default; -void NEConv3D::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) +void NEConv3D::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info); auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), + conv_info); _impl->op = std::move(f); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; } } -Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info) +Status NEConv3D::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info)); @@ -72,7 +79,7 @@ Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, void NEConv3D::run() { - if(_impl->op != nullptr) + if (_impl->op != nullptr) { _impl->op->run(_impl->run_pack); } diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index 535ac99001..84e8565aaf 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -24,24 +24,26 @@ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" namespace arm_compute { struct NEConvertFullyConnectedWeights::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() - : _impl(std::make_unique()) +NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique()) { } NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; -void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void NEConvertFullyConnectedWeights::configure(const ITensor *input, + ITensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -51,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } @@ -64,4 +68,4 @@ void NEConvertFullyConnectedWeights::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // 
namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 89e0e498c9..37958fc2e9 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuConv2d.h" @@ -43,34 +44,44 @@ struct NEConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } NEConvolutionLayer::~NEConvolutionLayer() = default; -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: @@ -78,7 +89,8 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const case ConvolutionMethod::DIRECT: { auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); _impl->op = std::move(f); break; } @@ -94,33 +106,46 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - switch(cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); @@ -129,12 +154,17 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, - const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod 
NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); } void NEConvolutionLayer::run() @@ -143,7 +173,7 @@ void NEConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -155,7 +185,7 @@ void NEConvolutionLayer::run() void NEConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index c2059e8e98..c975d3a5b5 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCopy.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NECopy::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECopy::NECopy() - : _impl(std::make_unique()) +NECopy::NECopy() : _impl(std::make_unique()) { } -NECopy::NECopy(NECopy &&) = default; +NECopy::NECopy(NECopy &&) = default; NECopy &NECopy::operator=(NECopy &&) = default; NECopy::~NECopy() = default; diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index cca8b400ee..a94b0882da 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/runtime/NEON/functions/NECropResize.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NECropKernel.h" @@ -35,18 +36,32 @@ namespace arm_compute NECropResize::~NECropResize() = default; NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -56,11 +71,17 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -81,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -108,7 +129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until 
run-time and so the kernels cannot be configured until then. @@ -117,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false }); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 439aff0840..3987370d9e 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -61,7 +62,8 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } } // namespace @@ -82,17 +84,24 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memor { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, - bool enable_fast_math, const WeightsInfo &weights_info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); 
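For context on the deconvolution_output_dimensions() checks being re-wrapped in this validate() hunk: the expected output extent follows the usual transposed-convolution size rule. A simplified 1-D version (symmetric padding, no extra output padding, hypothetical helper name) is sketched below; the real helper also handles asymmetric padding and per-dimension strides.

#include <cstddef>

// Simplified transposed-convolution output length:
//   out = (in - 1) * stride + kernel - pad_before - pad_after
std::size_t deconv_output_len(std::size_t in, std::size_t stride, std::size_t kernel, std::size_t pad)
{
    return (in - 1) * stride + kernel - 2 * pad;
}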
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); } @@ -101,11 +110,13 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -115,15 +126,18 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } uint32_t deconv_pad_x = 0; @@ -141,44 +155,61 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first); ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second); - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions const bool do_upsampling = stride_x != 1 || stride_y != 1; - const unsigned int batches_idx = 
get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - if(do_upsampling) + if (do_upsampling) { const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? 
nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -191,7 +222,8 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); @@ -199,12 +231,11 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); - const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions _do_upsampling = stride_x != 1 || stride_y != 1; @@ -216,12 +247,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[1] = static_cast(height_idx); // Setup convolution and upsampling, if needed - if(_do_upsampling) + if (_do_upsampling) { _memory_group.manage(&_scaled_output); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); scale_out_info.set_data_layout(data_layout); _scaled_output.allocator()->init(scale_out_info); @@ -229,14 +260,17 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con // The padding amount can be given as input to the convolution layer. 
_upsample_f.configure(input, &_scaled_output, upsample_info); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); _scaled_output.allocator()->allocate(); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); } } @@ -246,7 +280,7 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_upsampling) + if (_do_upsampling) { _upsample_f.run(); } @@ -255,7 +289,7 @@ void NEDeconvolutionLayer::run() void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index 1ec32074a5..766635dfa1 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCast.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEDepthConvertLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDepthConvertLayer::NEDepthConvertLayer() - : _impl(std::make_unique()) +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique()) { } -NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; NEDepthConvertLayer::~NEDepthConvertLayer() = default; @@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return cpu::CpuCast::validate(input, output, policy); @@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void NEDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 
f4a8a17e05..47564059ec 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 4dabef3bd7..6c085645db 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDepthwiseConv2d.h" @@ -39,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl { - ITensor *src{ nullptr }; // SRC_0 - ITensor *dst{ nullptr }; // DST_0 - const ITensor *weights - { - nullptr - }; // SRC_1 - const ITensor *biases - { - nullptr - }; // SRC_2 + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 Tensor permuted_input{}; // INT_0 Tensor permuted_weights{}; // INT_1 Tensor permuted_output{}; // INT_2 Tensor workspace{}; // INT_3 Tensor packed_weights{}; // INT_4 - std::shared_ptr op{ nullptr }; - bool is_prepared{ false }; - bool permute{ false }; + std::shared_ptr op{nullptr}; + bool is_prepared{false}; + bool permute{false}; }; -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr memory_manager) +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr memory_manager) : _memory_group(memory_manager), _impl(std::make_unique()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -82,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permute = is_nhwc; _impl->op = std::make_unique(); - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(), - _impl->dst->info(), info); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? 
nullptr : _impl->biases->info(), _impl->dst->info(), info); // Configure pipeline ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); @@ -92,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!is_activationlayer_enabled) + if (!is_activationlayer_enabled) { act_info_to_use = act_info; } - info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation }; + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; auto dwc_optimized_func = std::make_unique(); - if(is_nhwc) + if (is_nhwc) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -122,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); @@ -133,29 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: } else { - dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->dst->info(), info); } // Allocate memory based on the internal memory requirements experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); - _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment); - _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment); + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); _memory_group.manage(&_impl->workspace); _memory_group.manage(&_impl->packed_weights); _impl->workspace.allocator()->allocate(); _impl->packed_weights.allocator()->allocate(); } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -180,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_impl->permute) + if (_impl->permute) { _impl->permuted_weights.allocator()->allocate(); } - if(!_impl->permuted_weights.is_used()) + if (!_impl->permuted_weights.is_used()) { _impl->permuted_weights.allocator()->free(); } @@ -202,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl Tensor permuted_input{}; Tensor permuted_weights{}; Tensor permuted_output{}; - bool is_prepared{ false }; - bool is_nchw{ false }; - bool is_activationlayer_enabled{ false }; - const ITensor *weights{ nullptr }; - const ITensor *biases{ nullptr }; - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::shared_ptr op{ nullptr }; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr op{nullptr}; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() @@ -217,14 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) 
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); _impl->src = input; _impl->dst = output; @@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); weights_to_use = &_impl->permuted_weights; - _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); output_to_use = &_impl->permuted_output; } auto depthwise_conv_kernel = std::make_unique(); - depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? 
nullptr : biases->info(), output_to_use->info(), info); - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_output = std::make_unique(); permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); @@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -298,49 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr op{ nullptr }; + std::shared_ptr op{nullptr}; }; #endif // DOXYGEN_SKIP_THIS -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_shared(); - _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - info); - switch(_impl->depth_conv_func) + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.run(); @@ -355,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.prepare(); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 83e0131c83..28d19d2950 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuDequantize.h" namespace arm_compute { struct NEDequantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDequantizationLayer::NEDequantizationLayer() - : _impl(std::make_unique()) +NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique()) { } NEDequantizationLayer::~NEDequantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp index 1da8b012b3..b347390162 100644 --- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp +++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include @@ -35,24 +36,36 @@ namespace arm_compute { NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), 
_decoded_scores(), _run_dequantize(false) + : _memory_group(std::move(memory_manager)), + _dequantize(), + _detection_post_process(), + _decoded_scores(), + _run_dequantize(false) { } -void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); - ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), - output_scores->info(), - num_detection->info(), info)); - ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); const ITensor *input_scores_to_use = input_scores; DetectionPostProcessLayerInfo info_to_use = info; _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type()); - if(_run_dequantize) + if (_run_dequantize) { _memory_group.manage(&_decoded_scores); @@ -61,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c input_scores_to_use = &_decoded_scores; // Create a new info struct to avoid dequantizing in the CPP layer - std::array scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() }; - DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(), - scales_values, info.use_regular_nms(), info.detection_per_class(), false); + std::array scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), + info.scale_value_w()}; + DetectionPostProcessLayerInfo info_quantized( + info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), + info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false); info_to_use = info_quantized; } - _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use); + _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info_to_use); _decoded_scores.allocator()->allocate(); } -Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors, - 
ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_scores, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type()); - if(run_dequantize) + if (run_dequantize) { TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, + output_boxes, output_classes, output_scores, + num_detection, info)); return Status{}; } @@ -90,7 +114,7 @@ void NEDetectionPostProcessLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Decode scores if necessary - if(_run_dequantize) + if (_run_dequantize) { _dequantize.run(); } diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index ef3d3d6055..f1c2cf969f 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -27,17 +27,18 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/cpu/operators/CpuDirectConv2d.h" namespace arm_compute { struct NEDirectConvolutionLayer::Impl { - ITensor *src{ nullptr }; - const ITensor *weights{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + const ITensor *weights{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr memory_manager) @@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(_memory_manager); - _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info); + _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? 
bias->info() : nullptr), output->info(), + conv_info, act_info); } -Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info); diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp index c958adf97c..685ef2d4d7 100644 --- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp +++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h" -#include "arm_compute/core/Validate.h" -#include "src/cpu/operators/CpuElementwise.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuElementwise.h" #include @@ -33,17 +34,16 @@ namespace arm_compute { struct NEElementwiseMax::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMax::NEElementwiseMax() - : _impl(std::make_unique()) +NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique()) { } -NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; +NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default; NEElementwiseMax::~NEElementwiseMax() = default; @@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMax::validate(input1, input2, output); @@ -74,17 +77,16 @@ void NEElementwiseMax::run() struct NEElementwiseMin::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMin::NEElementwiseMin() - : _impl(std::make_unique()) +NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique()) { } -NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; +NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default; NEElementwiseMin::~NEElementwiseMin() = default; @@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const 
ActivationLayerInfo &act_info) +Status NEElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMin::validate(input1, input2, output); @@ -115,21 +120,23 @@ void NEElementwiseMin::run() struct NEElementwiseSquaredDiff::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() - : _impl(std::make_unique()) +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique()) { } -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default; -void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseSquaredDiff::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output); @@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run() struct NEElementwiseDivision::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseDivision::NEElementwiseDivision() - : _impl(std::make_unique()) +NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique()) { } -NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; +NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default; NEElementwiseDivision::~NEElementwiseDivision() = default; -void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseDivision::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo 
*output, const ActivationLayerInfo &act_info) +Status NEElementwiseDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseDivision::validate(input1, input2, output); @@ -197,21 +212,23 @@ void NEElementwiseDivision::run() struct NEElementwisePower::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwisePower::NEElementwisePower() - : _impl(std::make_unique()) +NEElementwisePower::NEElementwisePower() : _impl(std::make_unique()) { } -NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; +NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default; NEElementwisePower::~NEElementwisePower() = default; -void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwisePower::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwisePower::validate(input1, input2, output); @@ -239,22 +259,22 @@ void NEElementwisePower::run() template struct NEElementwiseComparisonStatic::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr> op{nullptr}; }; template -NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() - : _impl(std::make_unique()) +NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() : _impl(std::make_unique()) { } template NEElementwiseComparisonStatic::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default; -template -NEElementwiseComparisonStatic &NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; -template +template +NEElementwiseComparisonStatic & +NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; +template NEElementwiseComparisonStatic::~NEElementwiseComparisonStatic() = default; template @@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic::configure(ITensor *input1, ITensor *inp } template -Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) { return cpu::CpuElementwiseComparisonStatic::validate(input1, input2, output); } template -void 
NEElementwiseComparisonStatic::run() +void NEElementwiseComparisonStatic::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); @@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic::run() struct NEElementwiseComparison::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseComparison::NEElementwiseComparison() - : _impl(std::make_unique()) +NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique()) { } -NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; +NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default; NEElementwiseComparison::~NEElementwiseComparison() = default; @@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso _impl->op->configure(input1->info(), input2->info(), output->info(), op); } -Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) +Status NEElementwiseComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation op) { return cpu::CpuElementwiseComparison::validate(input1, input2, output, op); } diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp index a0674ec320..23a092c407 100644 --- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp +++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp @@ -22,7 +22,9 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h" + #include "src/cpu/operators/CpuElementwiseUnary.h" + #include namespace arm_compute @@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary; template struct NEElementwiseUnaryLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr cpu_op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr cpu_op{nullptr}; }; template -NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() - : _impl(std::make_unique()) +NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() : _impl(std::make_unique()) { } template NEElementwiseUnaryLayer::~NEElementwiseUnaryLayer() = default; template NEElementwiseUnaryLayer::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default; -template +template NEElementwiseUnaryLayer &NEElementwiseUnaryLayer::operator=(NEElementwiseUnaryLayer &&) = default; template @@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer::validate(const ITensorInfo *input, const ITe } template -void NEElementwiseUnaryLayer::run() +void NEElementwiseUnaryLayer::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _impl->src); diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp index 343b817eba..fb75f9da29 100644 --- a/src/runtime/NEON/functions/NEFFT1D.cpp +++ b/src/runtime/NEON/functions/NEFFT1D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" @@ -37,7 +38,15 @@ namespace arm_compute NEFFT1D::~NEFFT1D() = default; NEFFT1D::NEFFT1D(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(), + _fft_kernels(), + _scale_kernel(), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _axis(0), + _run_scale(false) { } @@ -74,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & _fft_kernels.resize(_num_ffts); _axis = config.axis; - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -84,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels[i] = std::make_unique(); - _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; _scale_kernel = std::make_unique(); - is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? 
_scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -113,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -122,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -140,13 +151,13 @@ void NEFFT1D::run() NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index ab422bd2ae..066909221d 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -33,7 +34,10 @@ namespace arm_compute NEFFT2D::~NEFFT2D() = default; NEFFT2D::NEFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -78,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 0551d756fb..94f85e5ffa 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -25,15 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include 
"src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" namespace arm_compute @@ -46,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -102,8 +103,13 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr mem } NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); @@ -115,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -137,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -158,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -166,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -193,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -205,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + 
_extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -235,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -247,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -260,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); @@ -279,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -291,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if 
(act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -313,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -331,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -349,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -362,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index 43667783bf..bc1d5b7f5c 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFill.h" #include @@ -32,15 +33,14 @@ namespace arm_compute { struct NEFill::Impl { - ITensor *tensor{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *tensor{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFill::NEFill() - : _impl(std::make_unique()) +NEFill::NEFill() : _impl(std::make_unique()) { } -NEFill::NEFill(NEFill &&) = default; +NEFill::NEFill(NEFill &&) = default; NEFill &NEFill::operator=(NEFill &&) = default; NEFill::~NEFill() = default; diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index d633e340f8..a3ab9c3db4 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { -NEFillBorder::NEFillBorder() - : _border_handler(nullptr) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) { } -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); _border_handler = std::make_unique(); diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index f435842634..56db2be3fa 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include 
"src/cpu/operators/CpuFlatten.h" @@ -33,16 +34,15 @@ namespace arm_compute { struct NEFlattenLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFlattenLayer::NEFlattenLayer() - : _impl(std::make_unique()) +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique()) { } -NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; NEFlattenLayer::~NEFlattenLayer() = default; @@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique(); _impl->op->configure(_impl->src->info(), _impl->dst->info()); @@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return cpu::CpuFlatten::validate(input, output); diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index d2dc48a159..112c93c478 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -24,22 +24,22 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFloor.h" namespace arm_compute { struct NEFloor::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFloor::NEFloor() - : _impl(std::make_unique()) +NEFloor::NEFloor() : _impl(std::make_unique()) { } -NEFloor::NEFloor(NEFloor &&) = default; +NEFloor::NEFloor(NEFloor &&) = default; NEFloor &NEFloor::operator=(NEFloor &&) = default; NEFloor::~NEFloor() = default; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 891487efd3..2656d0fa0f 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuFullyConnected.h" @@ -38,80 +39,90 @@ using namespace arm_compute::experimental; struct NEFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager 
*weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info, - weights_info)); + output->info(), fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = false; - _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info, weights_info); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), + fc_info, weights_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info, - const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, weights_info); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -122,7 +133,7 @@ void NEFullyConnectedLayer::run() void NEFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -131,13 +142,13 @@ void NEFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) 
{ _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index 6612845d86..f5b8b57dac 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" @@ -35,29 +36,42 @@ namespace arm_compute { NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, - bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); _fuse_bn_kernel = std::make_unique(); - _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index e51f2f9eb6..934a8250cc 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/CPP/Validate.h" #include 
"src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemm.h" @@ -39,12 +40,12 @@ namespace arm_compute struct NEGEMM::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_b{ nullptr }; - bool is_prepared{ false }; + const ITensor *original_b{nullptr}; + bool is_prepared{false}; ITensorPack run_pack{}; ITensorPack prep_pack{}; @@ -61,10 +62,17 @@ NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager * NEGEMM::~NEGEMM() = default; -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); // Check if we need to reshape the matrix B only on the first run _impl->is_prepared = false; @@ -73,24 +81,32 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Make the B matrix dynamic values. auto b_info_to_use = b->info()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c }, { ACL_DST, d } }; - _impl->prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. 
auto b_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_to_use->set_are_values_constant(false); } @@ -98,8 +114,14 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); } -Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, - float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha, beta); return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); @@ -115,15 +137,15 @@ void NEGEMM::run() void NEGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->original_b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 42b8b70405..6cca02eea9 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmDirectConv2d.h" @@ -35,25 +36,25 @@ using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; MemoryGroup memory_group{}; - bool is_prepared{ false }; + bool is_prepared{false}; experimental::MemoryRequirements aux_mem_req{}; }; -NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) - : _impl(std::make_unique()) +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; -void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -61,15 +62,21 @@ void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITens _impl->is_prepared = false; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; - _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } @@ -84,15 +91,15 @@ void NEGEMMConv2d::run() void NEGEMMConv2d::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index fe3ea6a767..c8f65d2fd9 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmConv2d.h" @@ -36,17 +37,18 @@ namespace arm_compute { struct NEGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, IWeightsManager *weights_manager) +NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -54,37 +56,61 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptrweights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() @@ -96,7 +122,7 @@ void NEGEMMConvolutionLayer::run() void NEGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 453d3cedef..44bfc6a51e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -29,8 +29,8 @@ 
#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" using namespace arm_compute::experimental; @@ -39,18 +39,19 @@ namespace arm_compute { struct NEGEMMLowpMatrixMultiplyCore::Impl { - const ITensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *b{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -58,41 +59,41 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } _impl->b = b; _impl->op = std::make_unique(); - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, a }, - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. 
auto b_info_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } @@ -109,15 +110,15 @@ void NEGEMMLowpMatrixMultiplyCore::run() void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 7e1de3c257..8178003b5e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -25,45 +25,48 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuGemmLowpOutputStage.h" namespace arm_compute { struct NEGEMMLowpOutputStage::Impl { - const ITensor *src{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; ITensorPack run_pack{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; }; -NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() - : _impl(std::make_unique()) +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique()) { } NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); _impl->src = input; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(); _impl->op->configure(input->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), info); - _impl->run_pack = - { - { TensorType::ACL_SRC, _impl->src }, - { TensorType::ACL_BIAS, _impl->bias }, - { TensorType::ACL_DST, _impl->dst } - }; + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index f5d19c769e..62b8cfa48b 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "src/core/NEON/kernels/NEGatherKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 1c0e736766..1022b4153e 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -25,11 +25,12 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" -#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -68,42 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = 
scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); _compute_anchors = std::make_unique(); - _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -117,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -131,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -154,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals.configure(&_all_proposals, 
&_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -174,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -187,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -205,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -218,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -229,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo 
proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -311,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -330,7 +373,7 @@ void NEGenerateProposalsLayer::run() NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -339,7 +382,7 @@ void NEGenerateProposalsLayer::run() _flatten_deltas.run(); 
_flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors.run(); _dequantize_deltas.run(); @@ -348,7 +391,7 @@ void NEGenerateProposalsLayer::run() // Build the boxes _bounding_box.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals.run(); } diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp index 822dcf491c..78218cbdee 100644 --- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" @@ -34,7 +35,13 @@ namespace arm_compute NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } @@ -43,14 +50,14 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); const DataLayout data_layout = input->info()->data_layout(); - const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }; + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; _normalization_kernel = std::make_unique(); - if(!_is_nchw) + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -72,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), - InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -84,7 +92,7 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } @@ -92,7 +100,7 @@ void NEInstanceNormalizationLayer::run() NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { _permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index c3ecfb430f..b7f6203efd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ 
b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -69,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index 428cdf8c04..1a08cdeb06 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,42 +40,122 @@ using namespace arm_compute::utils::info_helpers; NELSTMLayer::~NELSTMLayer() = default; NELSTMLayer::NELSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), 
_output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor 
*output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output, - lstm_params, activation_info, cell_threshold, projection_threshold); + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -83,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -116,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -138,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -161,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { 
_memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -183,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -201,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -228,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -237,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -281,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -304,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -335,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, 
lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -358,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input, // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -372,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -413,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -434,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -445,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -465,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape 
concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -499,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = 
arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape 
in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if 
(projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -590,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -611,12 +775,12 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -624,15 +788,17 @@ void NELSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } _subtract_input_gate.run(); } @@ -640,13 +806,13 @@ void NELSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -659,7 +825,7 @@ void NELSTMLayer::run() _transpose_cell_state.run(); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -671,18 +837,18 @@ void NELSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -693,10 +859,10 @@ void NELSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -710,10 +876,10 @@ void NELSTMLayer::run() void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index cfdeb000e0..41f9c3d700 100644 
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,36 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), _is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor 
*input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, 
recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -83,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -100,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector input_vector{ input, output_state_in }; + std::vector input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, 
&_input, Window::DimX); // Bias concatenation - std::vector bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -137,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -159,64 +238,80 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, 
&_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, 
qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -226,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -246,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo 
*input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -266,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -310,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + 
NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -320,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -346,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -357,7 +494,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info; @@ -372,68 +510,91 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, 
batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + 
ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -442,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -508,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { 
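        // One-off preparation guarded by _is_prepared: allocate the concatenated input-weights tensor
        // and run the weights concatenation so that subsequent run() calls can skip this work.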
_input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp index 92dcf15791..0013a521d1 100644 --- a/src/runtime/NEON/functions/NELogical.cpp +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NELogicalKernel.h" @@ -32,15 +33,14 @@ namespace arm_compute { struct LogicalArgs { - std::unique_ptr kernel{ nullptr }; + std::unique_ptr kernel{nullptr}; ITensorPack pack{}; }; struct NELogicalAnd::Impl : public LogicalArgs { }; -NELogicalAnd::NELogicalAnd() - : _impl(std::make_unique()) +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique()) { } NELogicalAnd::~NELogicalAnd() = default; @@ -72,8 +72,7 @@ void NELogicalAnd::run() struct NELogicalOr::Impl : public LogicalArgs { }; -NELogicalOr::NELogicalOr() - : _impl(std::make_unique()) +NELogicalOr::NELogicalOr() : _impl(std::make_unique()) { } NELogicalOr::~NELogicalOr() = default; @@ -105,8 +104,7 @@ void NELogicalOr::run() struct NELogicalNot::Impl : public LogicalArgs { }; -NELogicalNot::NELogicalNot() - : _impl(std::make_unique()) +NELogicalNot::NELogicalNot() : _impl(std::make_unique()) { } NELogicalNot::~NELogicalNot() = default; diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 58640f40ea..31898bafc4 100644 --- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuMatMul.h" @@ -33,23 +34,27 @@ namespace arm_compute { struct NEMatMul::Impl { - const ITensor *lhs{ nullptr }; - const ITensor *rhs{ nullptr }; - ITensor *output{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; }; -NEMatMul::NEMatMul() - : _impl(std::make_unique()) +NEMatMul::NEMatMul() : _impl(std::make_unique()) { } NEMatMul::~NEMatMul() = default; -void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { _impl->lhs = lhs; _impl->rhs = rhs; @@ -58,11 +63,16 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); _impl->op = std::make_unique(); _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status NEMatMul::validate(const 
ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); } diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp index 97ddaea41d..c3861afd2c 100644 --- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "src/cpu/operators/CpuMaxUnpooling.h" @@ -35,20 +36,22 @@ namespace arm_compute { struct NEMaxUnpoolingLayer::Impl { - const ITensor *src{ nullptr }; - const ITensor *indices{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; -NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() - : _fill_func(), _impl() +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() { } -void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info) +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); @@ -64,7 +67,10 @@ void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *o _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); } -Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index 7626aa0db2..dec0dde56d 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index d3b1696335..d6d2e9dc46 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" @@ -61,13 +62,16 @@ 
void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -78,4 +82,4 @@ void NENormalizationLayer::run() _multiply_f.run(); NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } -} \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index 80c5690a4e..963e68bac7 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuPRelu.h" namespace arm_compute @@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu; struct NEPReluLayer::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPReluLayer::NEPReluLayer() - : _impl(std::make_unique()) +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique()) { } -NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; NEPReluLayer::~NEPReluLayer() = default; diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 8bacdd3002..253566df0f 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -23,13 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -38,9 +38,9 @@ namespace uint32_t last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -52,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding) NEPadLayer::~NEPadLayer() = default; NEPadLayer::NEPadLayer() - : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), 
_slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { _pad_kernel = std::make_unique(); _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); @@ -85,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -124,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Reflect the input values for the padding before and after the input. std::vector concat_vector; - if(_padding[i].first > 0) + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -138,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -154,12 +167,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? 
output : &_concat_results[i]; out->info()->set_quantization_info(output->info()->quantization_info()); - for(auto &v : concat_vector) + for (auto &v : concat_vector) { v->info()->set_quantization_info(input->info()->quantization_info()); } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -170,7 +183,11 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); @@ -178,15 +195,16 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. _num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -210,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -231,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -256,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -268,15 +290,15 @@ void NEPadLayer::run() case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < 
_num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index 517b86a1cb..80cd04ce6c 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -24,19 +24,19 @@ #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuPermute.h" namespace arm_compute { struct NEPermute::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPermute::NEPermute() - : _impl(std::make_unique()) +NEPermute::NEPermute() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index ad83a26beb..97155a9e74 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuMul.h" #include @@ -32,32 +33,42 @@ namespace arm_compute { struct NEPixelWiseMultiplication::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPixelWiseMultiplication::NEPixelWiseMultiplication() - : _impl(std::make_unique()) +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique()) { } NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(input1->info(), 
input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); } void NEPixelWiseMultiplication::run() @@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run() struct NEComplexPixelWiseMultiplication::Impl { - ITensor *src_0{ nullptr }; - ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp index 53f9dbf0a2..e017e8c21d 100644 --- a/src/runtime/NEON/functions/NEPooling3dLayer.cpp +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool3d.h" @@ -33,9 +34,9 @@ namespace arm_compute { struct NEPooling3dLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -43,8 +44,7 @@ struct NEPooling3dLayer::Impl NEPooling3dLayer::~NEPooling3dLayer() = default; -NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -56,11 +56,12 @@ void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Po _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return cpu::CpuPool3d::validate(input, output, pool_info); } @@ -72,4 +73,4 @@ void NEPooling3dLayer::run() 
_impl->op->run(_impl->run_pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 5a3b9c5e7e..eb9125be3c 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool2d.h" @@ -33,10 +34,10 @@ namespace arm_compute { struct NEPoolingLayer::Impl { - ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - ITensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl NEPoolingLayer::~NEPoolingLayer() = default; -NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return cpu::CpuPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index aba09239cf..dbb6bf9df1 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -27,15 +27,19 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); @@ -44,7 +48,10 @@ void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, IT _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const 
ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 2caaea02d8..dd78d10d16 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -27,13 +27,14 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" namespace arm_compute @@ -41,12 +42,19 @@ namespace arm_compute using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -55,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. 
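    // A TensorInfo copied from the input is therefore used as a stand-in output when validating
    // the layer-normalization kernel.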
- const TensorInfo out - { - in - }; + const TensorInfo out{in}; return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } @@ -98,14 +103,12 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } NEQLSTMLayer::~NEQLSTMLayer() = default; @@ -191,10 +194,17 @@ NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr memory_manager) _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -206,66 +216,87 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - 
forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - _input_to_forget_weights_transposed.info()->set_quantization_info(input_to_forget_weights->info()->quantization_info()); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); - _input_to_output_weights_transposed.info()->set_quantization_info(input_to_output_weights->info()->quantization_info()); - _recurrent_to_forget_weights_transposed.info()->set_quantization_info(recurrent_to_forget_weights->info()->quantization_info()); - _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info()); - _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info()); - - if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + _recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) { _convert_input_to_forget_weights_to_qsymm8 = true; // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 - _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) - .set_data_layout(input_to_forget_weights->info()->data_layout())); + _input_to_forget_weights_f32.allocator()->init( + TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); // Setup the quantize output tensor to go from F32 -> QSYMM8 - _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) - .set_data_layout(input_to_forget_weights->info()->data_layout()) - .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); _dequantize_input_to_forget_weights.configure(input_to_forget_weights, 
&_input_to_forget_weights_f32); _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } else { - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } const int batch_size = input->info()->dimension(1); @@ -277,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -287,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -309,22 +342,25 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. 
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); _input_to_input_reduction = std::make_unique(); _recurrent_to_input_reduction = std::make_unique(); - _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } _input_to_forget_reduction = std::make_unique(); @@ -334,19 +370,31 @@ void NEQLSTMLayer::configure(const ITensor *input, _input_to_output_reduction = std::make_unique(); _recurrent_to_output_reduction = std::make_unique(); - _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(input_to_output_weights->info(), 
_input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { _projection_reduction = std::make_unique(); - _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); } } @@ -354,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input, _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -375,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + 
QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -417,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -454,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -469,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - 
_cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -576,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -598,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -609,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. 
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ITensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -638,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input, accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, + ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); 
_has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -672,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); } -Status NEQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status NEQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -694,22 +812,27 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -728,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -755,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. 
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -763,60 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, 
-qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), input_to_cell_weights->quantization_info()); - const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, input_to_output_weights->data_type(), input_to_output_weights->quantization_info()); - const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); - const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_cell_weights->data_type(), recurrent_to_cell_weights->quantization_info()); - const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_output_weights->data_type(), recurrent_to_output_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); 
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - const TensorInfo recurrent_to_input_weights_transposed(TensorShape(num_units, output_size), 1, - recurrent_to_forget_weights->data_type(), lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, - lstm_params.input_to_input_weights()->data_type(), lstm_params.input_to_input_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -829,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
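For readers unfamiliar with the idiom being reformatted here: NETranspose is first validated against a TensorInfo whose shape is already swapped, and only afterwards configured on real tensors. A minimal sketch of that validate-then-configure pattern follows; the tensor names, the 32x16 shape and the QASYMM8_SIGNED data type are illustrative assumptions, not values taken from this file.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Validate a 2D weight transpose against a swapped-shape descriptor, then
// configure and run it. Shapes and data type are example values only.
void transpose_weights_sketch()
{
    const unsigned int input_size = 32;
    const unsigned int num_units  = 16;

    Tensor weights;
    Tensor weights_transposed;
    weights.allocator()->init(TensorInfo(TensorShape(input_size, num_units), 1, DataType::QASYMM8_SIGNED));
    weights_transposed.allocator()->init(TensorInfo(TensorShape(num_units, input_size), 1, DataType::QASYMM8_SIGNED));

    // Static validation first, mirroring the ARM_COMPUTE_RETURN_ON_ERROR calls above.
    if (NETranspose::validate(weights.info(), weights_transposed.info()))
    {
        NETranspose transpose;
        transpose.configure(&weights, &weights_transposed);
        weights.allocator()->allocate();
        weights_transposed.allocator()->allocate();
        transpose.run();
    }
}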
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = 
std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -859,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + 
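The peephole branches above and below rely on NEPixelWiseMultiplication with a unit scale, saturating conversion and round-to-zero rounding. A compact sketch of that call pattern is shown here; the tensors are assumed to be initialised and allocated elsewhere, and the helper name is purely illustrative.

#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Element-wise product of the cell state with a peephole weight vector,
// using the same conversion and rounding policies that the validate() calls
// above check.
void peephole_multiply(Tensor &cell_state, Tensor &cell_to_gate_weights, Tensor &gate_input)
{
    NEPixelWiseMultiplication mul;
    mul.configure(&cell_state, &cell_to_gate_weights, &gate_input, 1.f, ConvertPolicy::SATURATE,
                  RoundingPolicy::TO_ZERO);
    mul.run();
}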
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -882,94 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = 
lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + 
&input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
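The cell-update block just validated computes, in the quantized domain, the usual c_t = f_t * c_(t-1) + i_t * g_t and then optionally clips the result symmetrically. A short sketch of that clipping step with NEActivationLayer follows, assuming a hypothetical in-place tensor and clip value.

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// In-place symmetric clipping of a quantized cell state to [-clip, clip],
// mirroring the LU_BOUNDED_RELU usage validated above. The clip value is an
// example, not one taken from this patch.
void clip_cell_state(Tensor &cell_state, int16_t quantized_cell_clip)
{
    if (quantized_cell_clip > 0)
    {
        NEActivationLayer cell_clip;
        cell_clip.configure(&cell_state, nullptr,
                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                -quantized_cell_clip, quantized_cell_clip));
        cell_clip.run();
    }
}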
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -977,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
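Each output stage validated above turns a floating-point effective scale (weight scale times input scale divided by the intermediate scale, or the hidden-state rescale just computed) into the integer multiplier and shift carried by GEMMLowpOutputStageInfo. A small sketch of that decomposition follows, with made-up example scales.

#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

using namespace arm_compute;

// Decompose an effective rescale factor into the fixed-point multiplier/shift
// pair consumed by GEMMLowpOutputStageInfo. The scales are example values.
Status build_output_stage_info(GEMMLowpOutputStageInfo &info)
{
    const float weight_scale       = 0.004f;
    const float input_scale        = 0.05f;
    const float intermediate_scale = 0.001f;

    const float effective_scale = weight_scale * input_scale / intermediate_scale;
    return quantization::calculate_quantized_multiplier(effective_scale, &info.gemmlowp_multiplier,
                                                        &info.gemmlowp_shift);
}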
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = 
quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -1080,14 +1319,14 @@ void NEQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } @@ -1102,7 +1341,7 @@ void NEQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } @@ -1110,7 +1349,7 @@ void NEQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1122,14 +1361,14 @@ void NEQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } @@ -1142,7 +1381,7 @@ void NEQLSTMLayer::run() _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1153,14 +1392,14 @@ void NEQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } @@ -1173,31 +1412,31 @@ void NEQLSTMLayer::run() _hidden_outstage.run(); // Projection. 
- if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1209,9 +1448,9 @@ void NEQLSTMLayer::run() void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_convert_input_to_forget_weights_to_qsymm8) + if (_convert_input_to_forget_weights_to_qsymm8) { _input_to_forget_weights_f32.allocator()->allocate(); _input_to_forget_weights_symm8.allocator()->allocate(); @@ -1234,28 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack packII = - { - { TensorType::ACL_SRC, _input_to_input_weights }, - { TensorType::ACL_DST, &_input_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII); + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); - ITensorPack packRI = - { - { TensorType::ACL_SRC, _recurrent_to_input_weights }, - { TensorType::ACL_DST, &_recurrent_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI); + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1271,58 +1507,44 @@ void NEQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack packIF = - { - { TensorType::ACL_SRC, _input_to_forget_weights }, - { TensorType::ACL_DST, &_input_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF); - - ITensorPack packRF = - { - { TensorType::ACL_SRC, _recurrent_to_forget_weights }, - { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF); - - ITensorPack packIC = - { - { TensorType::ACL_SRC, _input_to_cell_weights }, - { TensorType::ACL_DST, &_input_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, 
_input_to_cell_reduction->window(), packIC); - - ITensorPack packRC = - { - { TensorType::ACL_SRC, _recurrent_to_cell_weights }, - { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC); - - ITensorPack packIO = - { - { TensorType::ACL_SRC, _input_to_output_weights }, - { TensorType::ACL_DST, &_input_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO); - - ITensorPack packRO = - { - { TensorType::ACL_SRC, _recurrent_to_output_weights }, - { TensorType::ACL_DST, &_recurrent_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO); - - if(_has_projection) + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack pack = - { - { TensorType::ACL_SRC, _projection_weights }, - { TensorType::ACL_DST, &_projection_eff_bias } - }; - NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack); - if(_projection_bias != nullptr) + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1332,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp 
b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index dad246ac89..9b72783c97 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { struct NEQuantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEQuantizationLayer::NEQuantizationLayer() - : _impl(std::make_unique()) +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique()) { } NEQuantizationLayer::~NEQuantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index a66ef3d27a..2824693800 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -37,13 +38,26 @@ namespace arm_compute NERNNLayer::~NERNNLayer() = default; NERNNLayer::NERNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -60,24 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, 
&shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -125,7 +149,7 @@ void NERNNLayer::run() void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index a9bdb50d95..68bb5d5ef3 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -29,14 +29,20 @@ namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index a24f2aac50..babec4aa92 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" @@ -31,17 +33,22 @@ namespace arm_compute { NEROIPoolingLayer::~NEROIPoolingLayer() = default; -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() { } -Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); @@ -53,4 +60,4 @@ void NEROIPoolingLayer::run() { NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index a6f7be8be0..95492df126 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NERangeKernel.h" @@ -31,8 +32,7 @@ namespace arm_compute { NERange::~NERange() = default; -NERange::NERange() - : _kernel() +NERange::NERange() : _kernel() { } @@ -52,4 +52,4 @@ void NERange::run() { NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index 9f96479295..d37cf4a8d0 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -25,21 +25,24 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); 
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -47,29 +50,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -91,11 +94,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax NEReduceMean::~NEReduceMean() = default; NEReduceMean::NEReduceMean(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() { } -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -107,7 +118,8 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); @@ -124,37 +136,40 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? 
tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } @@ -166,11 +181,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 9660347a16..8540d750fc 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { @@ -42,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -59,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis) NEReductionOperation::~NEReductionOperation() = default; NEReductionOperation::NEReductionOperation(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -74,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -84,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } @@ -102,7 +115,8 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -112,19 +126,32 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); @@ -135,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); @@ -146,7 +173,7 @@ void NEReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 427bf8c501..89cf575f38 100644 --- a/src/runtime/NEON/functions/NEReorderLayer.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -23,20 +23,24 @@ */ #if defined(__aarch64__) -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEReorderKernel.h" namespace arm_compute { NEReorderLayer::~NEReorderLayer() = default; -NEReorderLayer::NEReorderLayer() - : _reorder_kernel(std::make_unique()) +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique()) { } -void NEReorderLayer::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { auto k = std::make_unique(); k->configure(input, output, input_wf, output_wf); @@ -49,11 +53,14 @@ void NEReorderLayer::run() NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -Status NEReorderLayer::validate(const ITensorInfo 
*input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { return NEReorderKernel::validate(input, output, input_wf, output_wf); } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index 8ee73d7390..14e41d6df4 100644 --- a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" -#include "src/core/NEON/kernels/NEReorgLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index 3ccb42361e..bed70ff66c 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuReshape.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEReshapeLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEReshapeLayer::NEReshapeLayer() - : _impl(std::make_unique()) +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique()) { } -NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; NEReshapeLayer::~NEReshapeLayer() = default; diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index e1988f2ab3..a90f8d2e76 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "src/core/NEON/kernels/NEReverseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { @@ -38,7 +37,10 @@ void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor * _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 09f037334e..0d011064f6 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/operators/CpuScale.h" @@ -32,16 +33,16 @@ namespace arm_compute { struct NEScale::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor dx{ nullptr }; /**< 
Element's distance between the X real coordinate and the smallest X following integer */ - Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ - Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr op{nullptr}; }; -NEScale::NEScale() - : _impl(std::make_unique()) +NEScale::NEScale() : _impl(std::make_unique()) { } NEScale::~NEScale() = default; @@ -57,25 +58,33 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape TensorShape shape(output->info()->dimension(idx_width)); shape.set(1, output->info()->dimension(idx_height), false); - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(data_layout, input->info()->data_type(), policy_to_use, info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { const TensorInfo tensor_info_dxdy(shape, Format::F32); const TensorInfo tensor_info_offsets(shape, Format::S32); @@ -83,7 +92,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & _impl->dx.allocator()->init(tensor_info_dxdy); _impl->dy.allocator()->init(tensor_info_dxdy); _impl->offsets.allocator()->init(tensor_info_offsets); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -109,7 +118,8 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index 26c2eb8fe9..55cad2202b 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESelectKernel.h" diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 4a8912bfe9..12d43adc84 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -34,7 +35,10 @@ namespace arm_compute { namespace experimental { -void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - 
})); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct NESlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NESlice::NESlice() - : _impl(std::make_unique()) +NESlice::NESlice() : _impl(std::make_unique()) { } -NESlice::NESlice(NESlice &&) = default; +NESlice::NESlice(NESlice &&) = default; NESlice &NESlice::operator=(NESlice &&) = default; NESlice::~NESlice() = default; -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::NESlice::validate(input, output, starts, ends); } diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 0947ff94a6..e3c2012d05 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" @@ -35,10 +37,10 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor max{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor max{nullptr}; + std::unique_ptr> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -53,9 +55,9 @@ NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(std::shared_ptr NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric &NESoftmaxLayerGeneric::operator=(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric::~NESoftmaxLayerGeneric() = default; template @@ -68,12 +70,13 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->op = std::make_unique>(); _impl->op->configure(input->info(), output->info(), beta, axis); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); @@ -81,7 +84,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const I } template -void 
NESoftmaxLayerGeneric::run() +void NESoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index c4509510dc..556ebdd800 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" @@ -37,17 +38,19 @@ namespace arm_compute { NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _fill_f(), _has_padding(false) +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -57,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -71,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_ _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const 
Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -89,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { _fill_f->run(); } diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index b37bf0d20f..846b619429 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" @@ -36,8 +37,7 @@ namespace arm_compute { NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() +NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index db19bbb824..53b09e9ae5 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index 68554e0931..03e7026691 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStackLayerKernel.h" @@ -38,9 +39,7 @@ namespace arm_compute NEStackLayer::~NEStackLayer() = default; NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -54,7 +53,7 @@ void NEStackLayer::configure(const std::vector &input, int axis, ITen // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels[i] = std::make_unique(); _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output); @@ -72,7 +71,7 @@ Status NEStackLayer::validate(const std::vector &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -85,7 +84,7 @@ Status NEStackLayer::validate(const std::vector &input, int axis, void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY); } diff --git 
a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index 4f50749a4f..6a3ac8be05 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -32,9 +33,14 @@ namespace arm_compute { namespace experimental { -void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); @@ -43,9 +49,14 @@ void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct NEStridedSlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEStridedSlice::NEStridedSlice() - : _impl(std::make_unique()) +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique()) { } -NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; NEStridedSlice::~NEStridedSlice() = default; -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { _impl->src = input; _impl->dst = output; @@ -84,10 +99,16 @@ void NEStridedSlice::run() _impl->op->run(pack); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::NEStridedSlice::validate(input, output, 
starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 526603f1a3..d10b1c8e95 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" -#include "src/core/NEON/kernels/NETileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 78c7ea202a..0144a85e8c 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuTranspose.h" @@ -31,13 +32,12 @@ namespace arm_compute { struct NETranspose::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NETranspose::NETranspose() - : _impl(std::make_unique()) +NETranspose::NETranspose() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 0ffab5e92a..2f7ed2bb1f 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -55,19 +58,19 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector &output_vector, int axis) { std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -81,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -102,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vectortensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index a8eded29ff..8d77abcfc7 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuWinogradConv2d.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" - namespace arm_compute { using namespace arm_compute::experimental; @@ -42,14 +42,14 @@ using namespace 
arm_compute::experimental; struct NEWinogradConvolutionLayer::Impl { MemoryGroup memory_group{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - const ITensor *original_weights{ nullptr }; - bool is_prepared{ false }; - bool is_activationlayer_enabled{ false }; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool is_activationlayer_enabled{false}; DataLayout data_layout{}; }; @@ -61,17 +61,24 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptroriginal_weights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, act_info, enable_fast_math); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } void NEWinogradConvolutionLayer::run() @@ -82,15 +89,20 @@ void NEWinogradConvolutionLayer::run() _impl->op->run(_impl->run_pack); } -Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void NEWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); _impl->original_weights->mark_as_unused(); diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index b0a553212a..d4d6193fce 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include namespace arm_compute @@ -63,7 +64,7 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, _num_threads); - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; info.cpu_info = &cpu_info(); @@ -73,10 +74,10 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win { const unsigned int num_windows = num_threads; std::vector workloads(num_windows); - 
for(unsigned int t = 0; t < num_windows; t++) + for (unsigned int t = 0; t < num_windows; t++) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); @@ -92,7 +93,7 @@ void OMPScheduler::run_workloads(std::vector const unsigned int amount_of_work = static_cast(workloads.size()); const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work); - if(num_threads_to_use < 1) + if (num_threads_to_use < 1) { return; } @@ -100,8 +101,9 @@ void OMPScheduler::run_workloads(std::vector ThreadInfo info; info.cpu_info = &cpu_info(); info.num_threads = num_threads_to_use; - #pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1) - for(unsigned int wid = 0; wid < amount_of_work; ++wid) +#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ + schedule(static, 1) + for (unsigned int wid = 0; wid < amount_of_work; ++wid) { const int tid = omp_get_thread_num(); diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp index a47fa184fa..d746f618b5 100644 --- a/src/runtime/OffsetLifetimeManager.cpp +++ b/src/runtime/OffsetLifetimeManager.cpp @@ -43,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment) return (remainder != 0U) ? offset + (alignment - remainder) : offset; } } // namespace -OffsetLifetimeManager::OffsetLifetimeManager() - : _blob(0) +OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0) { } @@ -71,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings() // Update blob size size_t max_aggregated_size = 0; - std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b) - { - max_aggregated_size += b.max_size; - _blob.alignment = std::max(_blob.alignment, b.max_alignment); - }); + std::for_each(std::begin(_free_blobs), std::end(_free_blobs), + [&](const Blob &b) + { + max_aggregated_size += b.max_size; + _blob.alignment = std::max(_blob.alignment, b.max_alignment); + }); max_aggregated_size += _free_blobs.size() * _blob.alignment; _blob.owners = std::max(_blob.owners, _free_blobs.size()); _blob.size = std::max(_blob.size, max_aggregated_size); // Calculate group mappings - auto &group_mappings = _active_group->mappings(); + auto &group_mappings = _active_group->mappings(); size_t offset = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp index ffedf5586c..8f3c1a84ba 100644 --- a/src/runtime/OffsetMemoryPool.cpp +++ b/src/runtime/OffsetMemoryPool.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include - #include "arm_compute/runtime/OffsetMemoryPool.h" #include "arm_compute/core/Error.h" @@ -31,6 +29,8 @@ #include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/runtime/Types.h" +#include + namespace arm_compute { OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info) @@ -50,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) ARM_COMPUTE_ERROR_ON(_blob == nullptr); // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second)); @@ -59,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) void OffsetMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp index a8ad53da90..19415b35cf 100644 --- a/src/runtime/OperatorTensor.cpp +++ b/src/runtime/OperatorTensor.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/OperatorTensor.h" + #include "arm_compute/runtime/MemoryRegion.h" #include "support/Cast.h" @@ -47,7 +48,7 @@ ITensorInfo *OperatorTensor::info() uint8_t *OperatorTensor::buffer() const { - switch(_mem_type) + switch (_mem_type) { case MemoryType::CPU: return (uint8_t *)utils::cast::polymorphic_downcast(_memory->region())->buffer(); diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp index 87376a71a4..7fb9bd8000 100644 --- a/src/runtime/PoolManager.cpp +++ b/src/runtime/PoolManager.cpp @@ -31,8 +31,7 @@ using namespace arm_compute; -PoolManager::PoolManager() - : _free_pools(), _occupied_pools(), _sem(), _mtx() +PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx() { } @@ -52,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool) ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!"); arm_compute::lock_guard lock(_mtx); - auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr &pool_it) - { - return pool_it.get() == pool; - }); + auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), + [pool](const std::unique_ptr &pool_it) { return pool_it.get() == pool; }); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!"); _free_pools.splice(std::begin(_free_pools), _occupied_pools, it); _sem->signal(); @@ -78,7 +75,7 @@ std::unique_ptr PoolManager::release_pool() arm_compute::lock_guard lock(_mtx); ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!"); - if(!_free_pools.empty()) + if (!_free_pools.empty()) { std::unique_ptr pool = std::move(_free_pools.front()); ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr); diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp index d1dea066e7..1de8d2abdb 100644 --- a/src/runtime/RuntimeContext.cpp +++ b/src/runtime/RuntimeContext.cpp @@ -28,8 +28,7 @@ namespace arm_compute { -RuntimeContext::RuntimeContext() - : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) +RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) { } diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 
0713b9a2ad..e52fb59940 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -76,7 +76,7 @@ void Scheduler::set(Type t) bool Scheduler::is_available(Type t) { - if(t == Type::CUSTOM) + if (t == Type::CUSTOM) { return _custom_scheduler != nullptr; } @@ -93,11 +93,12 @@ Scheduler::Type Scheduler::get_type() IScheduler &Scheduler::get() { - if(_scheduler_type == Type::CUSTOM) + if (_scheduler_type == Type::CUSTOM) { - if(_custom_scheduler == nullptr) + if (_custom_scheduler == nullptr) { - ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) before Scheduler::get()"); + ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) " + "before Scheduler::get()"); } else { @@ -106,13 +107,13 @@ IScheduler &Scheduler::get() } else { - if(_schedulers.empty()) + if (_schedulers.empty()) { _schedulers = init(); } auto it = _schedulers.find(_scheduler_type); - if(it != _schedulers.end()) + if (it != _schedulers.end()) { return *it->second; } diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp index cc21d62630..4fb08d79f5 100644 --- a/src/runtime/SchedulerFactory.cpp +++ b/src/runtime/SchedulerFactory.cpp @@ -48,7 +48,7 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory: std::unique_ptr SchedulerFactory::create(Type type) { - switch(type) + switch (type) { case Type::ST: { diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp index 6f9a32c879..74ee539fec 100644 --- a/src/runtime/SchedulerUtils.cpp +++ b/src/runtime/SchedulerUtils.cpp @@ -47,35 +47,34 @@ std::pair split_2d(unsigned max_threads, std::size_t m, std: double ratio = m / static_cast(n); // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); + const unsigned adjusted = std::round(std::sqrt(max_threads * ratio)); //find the nearest factor of max_threads - for(unsigned i = 0; i != adjusted; ++i) + for (unsigned i = 0; i != adjusted; ++i) { //try down const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) + if (max_threads % adj_down == 0) { - return { adj_down, max_threads / adj_down }; + return {adj_down, max_threads / adj_down}; } //try up const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) + if (max_threads % adj_up == 0) { - return { adj_up, max_threads / adj_up }; + return {adj_up, max_threads / adj_up}; } } //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) + if (m > n) { - return { std::min(m, max_threads), 1 }; + return {std::min(m, max_threads), 1}; } else { - return { 1, std::min(n, max_threads) }; + return {1, std::min(n, max_threads)}; } } #endif /* #ifndef BARE_METAL */ diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp index ae16c8be0a..f87256abb1 100644 --- a/src/runtime/SubTensor.cpp +++ b/src/runtime/SubTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -SubTensor::SubTensor() - : _parent(nullptr), _info() +SubTensor::SubTensor() : _parent(nullptr), _info() { } diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp index 6dcef9f0b5..f17e323694 100644 --- a/src/runtime/Tensor.cpp +++ b/src/runtime/Tensor.cpp @@ -25,8 +25,7 @@ namespace arm_compute { -Tensor::Tensor(IRuntimeContext *) - : _allocator(this) +Tensor::Tensor(IRuntimeContext *) : _allocator(this) { } diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index 4ae27c59fc..372852bfea 100644 --- 
a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -43,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c const size_t parent_dims = parent_info.num_dimensions(); const size_t child_dims = child_info.num_dimensions(); - if(child_dims <= parent_dims) + if (child_dims <= parent_dims) { - for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) + for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) { const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1]; - if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) + if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) { is_valid = false; break; @@ -65,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c } } // namespace -TensorAllocator::TensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory() +TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory() { } @@ -88,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept { - if(&o != this) + if (&o != this) { _owner = o._owner; o._owner = nullptr; @@ -117,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates & _memory = Memory(allocator._memory.region()); // Init tensor info with new dimensions - size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); - sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size); + size_t total_size = + parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); + sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), + parent_info.offset_element_in_bytes(coords), total_size); // Set TensorInfo init(sub_info); @@ -133,7 +134,7 @@ void TensorAllocator::allocate() { // Align to 64-byte boundaries by default if alignment is not specified const size_t alignment_to_use = (alignment() != 0) ? 
alignment() : 64; - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { _memory.set_owned_region(std::make_unique(info().total_size(), alignment_to_use)); } diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp index 15e9d43a49..a7f7b5f3cb 100644 --- a/src/runtime/Utils.cpp +++ b/src/runtime/Utils.cpp @@ -41,20 +41,17 @@ static const std::string information = const std::string &string_from_scheduler_type(Scheduler::Type t) { - static std::map scheduler_type_map = - { - { Scheduler::Type::ST, "Single Thread" }, - { Scheduler::Type::CPP, "C++11 Threads" }, - { Scheduler::Type::OMP, "OpenMP Threads" }, - { Scheduler::Type::CUSTOM, "Custom" } - }; + static std::map scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"}, + {Scheduler::Type::CPP, "C++11 Threads"}, + {Scheduler::Type::OMP, "OpenMP Threads"}, + {Scheduler::Type::CUSTOM, "Custom"}}; return scheduler_type_map[t]; } void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints) { - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr); ctx->scheduler()->schedule(kernel, hints); @@ -68,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis) { // We need only 1 stage for all axis except x-axis - if(axis != 0) + if (axis != 0) { return 1; } diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp index 1bfb8124e9..aba32871d0 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include namespace arm_compute @@ -37,25 +38,27 @@ namespace cl_direct_conv { using namespace arm_compute::misc::shape_calculator; -ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) - : IClDirectConvKernelConfig(gpu) +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClDirectConvConfigArray configs_G71(&ClDirectConvDefaultConfigBifrost::configure_G71_f32, - &ClDirectConvDefaultConfigBifrost::configure_G71_f16, - &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + ClDirectConvConfigArray configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); - ClDirectConvConfigArray configs_default(&ClDirectConvDefaultConfigBifrost::configure_default_f32, - 
&ClDirectConvDefaultConfigBifrost::configure_default_f16, - &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + ClDirectConvConfigArray configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G71: func = configs_G71.get_function(src->data_type()); @@ -69,18 +72,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const IT return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 2; } @@ -93,18 +98,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -117,18 +124,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -141,18 +150,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(c return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = 
misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 2; } @@ -165,18 +176,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_ return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if (output_shape[0] > 16) { desc.m0 = 4; } @@ -188,5 +201,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_ return desc; } -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h index 6b60b2c007..ed6a4c3c68 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -41,15 +41,21 @@ public: ClDirectConvDefaultConfigBifrost(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp index 8f2fd82412..4b7666d5aa 100644 
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include namespace arm_compute @@ -37,25 +38,27 @@ namespace cl_direct_conv { using namespace arm_compute::misc::shape_calculator; -ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) - : IClDirectConvKernelConfig(gpu) +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) { } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClDirectConvConfigArray configs_G78(&ClDirectConvDefaultConfigValhall::configure_G78_f32, - &ClDirectConvDefaultConfigValhall::configure_G78_f16, - &ClDirectConvDefaultConfigValhall::configure_G78_u8); + ClDirectConvConfigArray configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); - ClDirectConvConfigArray configs_G57(&ClDirectConvDefaultConfigValhall::configure_G57_f32, - &ClDirectConvDefaultConfigValhall::configure_G57_f16, - &ClDirectConvDefaultConfigValhall::configure_G78_u8); + ClDirectConvConfigArray configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G57: func = configs_G57.get_function(src->data_type()); @@ -70,15 +73,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const IT return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -87,11 +92,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( desc.export_weights_to_cl_image = 
export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { - if(ofm == 4) + if (ofm == 4) { desc.m0 = 1; desc.n0 = 4; @@ -113,7 +118,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( } else { - if(m < 64) + if (m < 64) { desc.m0 = 1; desc.n0 = 1; @@ -131,15 +136,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -149,15 +156,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { // k0 should be as larger as possible. However, we should avoid // having left-over for loops that make the implementation slower. - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -166,9 +173,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( desc.k0 = 4; } - if(is_pointwise) + if (is_pointwise) { - if(ofm == 4) + if (ofm == 4) { desc.m0 = 1; desc.n0 = 4; @@ -187,15 +194,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( } else { - if(m < 64) + if (m < 64) { desc.m0 = 1; desc.n0 = 1; - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -206,9 +213,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( } else { - if(ofm >= 16) + if (ofm >= 16) { - if(m / 6 > 24000) + if (m / 6 > 24000) { desc.m0 = 6; } @@ -223,11 +230,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( { desc.m0 = 2; desc.n0 = 8; - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -243,18 +250,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); desc.n0 = 4; - if(output_shape[0] > 16) + if 
(output_shape[0] > 16) { desc.m0 = 4; } @@ -267,15 +276,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(c return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t m = dst_shape[1] * dst_shape[2]; @@ -283,9 +294,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { desc.m0 = 1; desc.n0 = 1; @@ -300,9 +311,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( } else { - if(m < 64) + if (m < 64) { - if(m == 1) + if (m == 1) { desc.m0 = 1; desc.n0 = 1; @@ -327,15 +338,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32( return desc; } -DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Get the output shape - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); const int32_t ofm = dst_shape[0]; @@ -344,9 +357,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(dst_shape[0] <= 4) + if (dst_shape[0] <= 4) { - if(is_pointwise) + if (is_pointwise) { desc.m0 = 2; desc.n0 = 1; @@ -361,9 +374,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( } else { - if(m < 64) + if (m < 64) { - if(m == 1) + if (m == 1) { desc.m0 = 1; desc.n0 = 1; @@ -378,7 +391,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( } else { - if(ofm > 16) + if (ofm > 16) { desc.m0 = 4; desc.n0 = 8; @@ -396,5 +409,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16( return desc; } -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h 
b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h index f9d5c5299e..efd879a567 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -41,15 +41,21 @@ public: ClDirectConvDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h index 232167fc59..2c2509f70b 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -46,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique(GPUTarget::G71); @@ -59,6 +59,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h index 6104d73594..e5b270c720 100644 --- a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -52,8 +53,7 @@ public: * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) - : 
_configs{ func_f32, func_f16, func_int8 } + ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -65,7 +65,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -92,8 +92,7 @@ public: * * @param[in] arch GPU target */ - IClDirectConvKernelConfig(GPUTarget arch) - : _target(arch) + IClDirectConvKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig); @@ -105,11 +104,12 @@ public: * @param[in] wei Weights tensor * @param[in] conv_info Convolution info */ - virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; protected: GPUTarget _target; }; -} // namespace opencl +} // namespace cl_direct_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp index 5311fdcec3..98ebf3ebbe 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" -#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/GPUTarget.h" @@ -30,28 +29,34 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + namespace arm_compute { namespace cl_dwc { namespace { -DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, bool is_g71) +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; desc.export_input_to_cl_image = false; - if(is_g71) + if (is_g71) { desc.export_weights_to_cl_image = false; } @@ -60,17 +65,17 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { desc.n0 = 4; } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + 
else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -81,14 +86,15 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } @@ -106,16 +112,20 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we return desc; } -DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, bool is_g71) +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { // Src and weights have the same dimension indices - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape src_shape = src->tensor_shape(); const TensorShape wei_shape = wei->tensor_shape(); const size_t src_w = src_shape[idx_w]; @@ -124,7 +134,7 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we desc.export_input_to_cl_image = false; - if(is_g71) + if (is_g71) { desc.export_weights_to_cl_image = false; } @@ -133,9 +143,9 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -146,11 +156,11 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -161,20 +171,21 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || 
(kernel_w == 1)) { desc.m0 = 1; } else { - if((src_w % 5) == 0) + if ((src_w % 5) == 0) { desc.m0 = 5; } @@ -194,27 +205,30 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we } } // namespace -ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) - : IClDWCNativeKernelConfig(gpu) +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) { } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { - using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); - ClDWCNativeConfigArray configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32, - &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + ClDWCNativeConfigArray configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); - ClDWCNativeConfigArray configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, - &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + ClDWCNativeConfigArray configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G71: func = configs_G71.get_function(src->data_type()); @@ -228,43 +242,58 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInf return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true); } -DWCComputeKernelInfo 
ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(wei); DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = false; desc.n0 = (depth_multiplier == 1) ? 4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) { desc.m0 = 2; } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h index cec2cae5dd..41d86c9c14 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -41,20 +41,38 @@ public: ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); // Inherited overridden method - DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) override; + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; private: - DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - 
unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); }; } // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp index 51f3787875..ef1bb3858c 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" -#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/GPUTarget.h" @@ -30,31 +29,36 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + namespace arm_compute { namespace cl_dwc { -ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) - : IClDWCNativeKernelConfig(gpu) +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) { } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { - using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); - ClDWCNativeConfigArray configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32, - &ClDWCNativeDefaultConfigValhall::configure_G78_f16, - &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + ClDWCNativeConfigArray configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); - ClDWCNativeConfigArray configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32, - &ClDWCNativeDefaultConfigValhall::configure_G77_f16, - 
&ClDWCNativeDefaultConfigValhall::configure_G78_u8); + ClDWCNativeConfigArray configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G77: func = configs_G77.get_function(src->data_type()); @@ -69,15 +73,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInf return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; @@ -85,17 +92,17 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { desc.n0 = 4; } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -106,14 +113,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } @@ -131,16 +139,19 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == 
DataLayout::NHWC) { // Src and weights have the same dimension indices - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape src_shape = src->tensor_shape(); const TensorShape wei_shape = wei->tensor_shape(); const size_t src_w = src_shape[idx_w]; @@ -150,9 +161,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -163,11 +174,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -178,20 +189,21 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } else { - if((src_w % 5) == 0) + if ((src_w % 5) == 0) { desc.m0 = 5; } @@ -210,19 +222,22 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(wei); DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = false; desc.n0 = (depth_multiplier == 1) ? 
4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) { desc.m0 = 2; } @@ -235,15 +250,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITe return desc; } -DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) { DWCComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); - const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); const TensorShape wei_shape = wei->tensor_shape(); const size_t kernel_c = wei_shape[idx_c]; const size_t kernel_w = wei_shape[idx_w]; @@ -251,9 +269,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT desc.export_input_to_cl_image = false; desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - if(desc.export_weights_to_cl_image == false) + if (desc.export_weights_to_cl_image == false) { desc.n0 = 8; } @@ -264,11 +282,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT } else { - if((depth_multiplier % 4) == 0) + if ((depth_multiplier % 4) == 0) { desc.n0 = 4; } - else if((depth_multiplier % 2) == 0) + else if ((depth_multiplier % 2) == 0) { desc.n0 = 2; } @@ -279,14 +297,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT } // Note: If we reduce n0, export to cl_image must be false - ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); desc.n0 = adjust_vec_size(desc.n0, kernel_c); // Set m0 only if stride_x == 1 and dilation_x == 1 - if(conv_info.stride().first == 1 && dilation.x() == 1) + if (conv_info.stride().first == 1 && dilation.x() == 1) { - if((kernel_w >= 9) || (kernel_w == 1)) + if ((kernel_w >= 9) || (kernel_w == 1)) { desc.m0 = 1; } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h index 4d51fa668c..fabce77b54 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -41,18 +41,33 @@ public: ClDWCNativeDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) override; + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, 
+ const Size2D &dilation, + unsigned int depth_multiplier) override; private: - DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); - DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); }; } // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp index 5593c6de61..c8b006c546 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -32,7 +32,7 @@ namespace cl_dwc bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) { // Check whether we can use the cl image with the weights. - if(!export_to_cl_image(weights)) + if (!export_to_cl_image(weights)) { return false; } @@ -45,12 +45,12 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons: // 1- When the kernel size is 1x1 // 2- When the depth multiplier is greater than 1 and not multiple of 4. 
- if((kernel_w == 1) && (kernel_h == 1)) + if ((kernel_w == 1) && (kernel_h == 1)) { return false; } - if((depth_multiplier > 1) && (depth_multiplier % 4) != 0) + if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0) { return false; } @@ -58,4 +58,4 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul return true; } } // namespace cl_dwc -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h index c08053dcb3..49ce6ff479 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -46,7 +46,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: // The heuristic for Midgard is the same as the one used for Arm Mali-G71 diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h index b5df132a12..614a6622df 100644 --- a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -52,8 +53,7 @@ public: * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -65,7 +65,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -92,8 +92,7 @@ public: * * @param[in] arch GPU target */ - IClDWCNativeKernelConfig(GPUTarget arch) - : _target(arch) + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); @@ -107,8 +106,11 @@ public: * @param[in] dilation Kernel dilation * @param[in] depth_multiplier Output feature maps multiplier */ - virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier) = 0; + virtual DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) = 0; protected: GPUTarget _target; diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp index 990f050112..3380d8f1b7 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -35,17 +35,19 @@ namespace cl_indirect_conv { using namespace arm_compute::misc::shape_calculator; -ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) - : IClIndirectConvKernelConfig(gpu) +ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu) { } 
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { - using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - ClIndirectConvConfigArray configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32, - &ClIndirectConvDefaultConfigValhall::configure_G77_f16); + ClIndirectConvConfigArray configs_G77( + &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16); // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned @@ -57,22 +59,24 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const return (this->*func)(src, wei, conv_info); } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); - const int32_t stride_x = conv_info.stride().first; - const int32_t stride_y = conv_info.stride().second; - const int32_t ofm = dst_shape[0]; - const int32_t m = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y); + const int32_t stride_x = conv_info.stride().first; + const int32_t stride_y = conv_info.stride().second; + const int32_t ofm = dst_shape[0]; + const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y); desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(ofm <= 4) + if (ofm <= 4) { desc.m0 = 1; desc.n0 = 2; @@ -82,7 +86,7 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3 { // The 16000 threshold value has been identified as the right // one for using the biggest block size allowed on F32: 5x4x4 - if(m < 16000) + if (m < 16000) { desc.m0 = 4; desc.n0 = 4; @@ -100,31 +104,33 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3 return desc; } -DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) { DirectConvComputeKernelInfo desc; - if(src->data_layout() == 
DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { - const TensorShape wei_shape = wei->tensor_shape(); - const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); const bool export_weights_to_cl_image = export_to_cl_image(wei); - const int32_t ofm = dst_shape[0]; - const int32_t m = dst_shape[1] * dst_shape[2]; - const int32_t k = wei_shape[0]; + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; desc.export_weights_to_cl_image = export_weights_to_cl_image; - if(ofm <= 4) + if (ofm <= 4) { // k0 should be as larger as possible. However, we should avoid // having left-over for loops that make the implementation slower. - if((k % 16) == 0) + if ((k % 16) == 0) { desc.k0 = 16; } - else if((k % 8) == 0) + else if ((k % 8) == 0) { desc.k0 = 8; } @@ -140,11 +146,11 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1 { // The 16000 threshold value has been identified as the right // one for using the biggest block size allowed on F16: 8x4 - if(m >= 16000 && k < 4) + if (m >= 16000 && k < 4) { desc.m0 = 8; desc.n0 = 4; - desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 + desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 } else { diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index 68dca91885..bab808c66c 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -41,11 +41,14 @@ public: ClIndirectConvDefaultConfigValhall(GPUTarget gpu); // Inherited overridden method - DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; private: - DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); - DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; } // namespace cl_indirect_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index 73fbb87560..dd614e1f68 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -45,7 +45,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h index d2f4cde662..d05da18b58 100644 
--- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -27,6 +27,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -49,8 +50,7 @@ public: * @param[in] func_f16 Function to call for indirect convolution F16 * */ - ClIndirectConvConfigArray(T func_f32, T func_f16) - : _configs{ func_f32, func_f16} + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} { } @@ -62,7 +62,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -85,8 +85,7 @@ public: * * @param[in] arch GPU target */ - IClIndirectConvKernelConfig(GPUTarget arch) - : _target(arch) + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); @@ -98,7 +97,8 @@ public: * @param[in] wei Weights tensor * @param[in] conv_info Convolution info */ - virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; protected: GPUTarget _target; diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp index 01102b3d60..b3c8d891dc 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -28,30 +28,33 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" -#include +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" +#include + namespace arm_compute { namespace cl_matmul { -ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) - : IClMatMulNativeKernelConfig(gpu) +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) { } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) { - using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo & info); + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - ClMatMulNativeConfigArray configs_G710(&ClMatMulNativeDefaultConfigValhall::configure_G710_f32, - &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, - &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + ClMatMulNativeConfigArray configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + 
&ClMatMulNativeDefaultConfigValhall::configure_G710_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G710: default: @@ -67,7 +70,7 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo const bool is_batched = lhs_shape.num_dimensions() > 2; - if(is_batched == true) + if (is_batched == true) { lhs_shape.collapse_from(2); } @@ -81,103 +84,48 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 2, 8, 4, 1 }, - { 24, 464, 412, 24, 2, 8, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 2, 4, 16, 1 }, - { 1568, 64, 40, 36, 2, 8, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 5, 4, 4, 0 }, - { 24, 464, 412, 24, 6, 2, 8, 0 }, - { 112, 184, 144, 28, 6, 4, 4, 0 }, - { 5776, 64, 32, 36, 5, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 1 }, - { 4096, 48, 32, 36, 2, 2, 16, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 6, 2, 8, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = - { - { 3136, 64, 64, 36, 5, 4, 4, 0 }, - { 4096, 48, 32, 36, 5, 4, 4, 0 }, - { 688, 92, 68, 32, 5, 4, 4, 0 }, - { 24, 464, 412, 24, 6, 2, 4, 0 }, - { 112, 184, 144, 28, 5, 4, 4, 0 }, - { 5776, 64, 32, 36, 5, 4, 4, 0 }, - { 1568, 64, 40, 36, 5, 4, 4, 0 }, - { 2920, 64, 64, 24, 6, 2, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 2, 8, 4, 1 }, - { 24, 464, 412, 24, 2, 8, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 2, 8, 8, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 4, 4, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 2, 2, 16, 1 }, - { 112, 184, 144, 28, 4, 4, 4, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 4, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = - { - { 3136, 64, 
64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 2, 8, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); @@ -185,17 +133,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_nt_nt; configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { configs_best_to_use = 
&configs_mnkb_best_nt_t; configs_fallback_to_use = &configs_mnkb_fallback_nt_t; } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_t_nt; configs_fallback_to_use = &configs_mnkb_fallback_t_nt; @@ -209,108 +157,51 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); - return select_info(desc0, - desc1, - m, n, k, b, DataType::F32, rhs_lock_padding); + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 16, 1 }, - { 24, 464, 412, 24, 4, 4, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = - { - { 3136, 64, 64, 36, 6, 4, 8, 0 }, - { 4096, 48, 32, 36, 6, 4, 8, 0 }, - { 688, 92, 68, 32, 6, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 0 }, - { 112, 184, 144, 28, 6, 4, 8, 0 }, - { 5776, 64, 32, 36, 6, 4, 8, 0 }, - { 1568, 64, 40, 36, 6, 4, 8, 0 }, - { 2920, 64, 64, 24, 6, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 6, 4, 8, 1 }, - { 4096, 48, 32, 36, 6, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 6, 2, 4, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 6, 4, 8, 1 }, - { 1568, 64, 40, 36, 6, 4, 8, 1 }, - { 2920, 64, 64, 24, 6, 4, 8, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = - { - { 3136, 64, 64, 36, 6, 2, 16, 0 }, - { 4096, 48, 32, 36, 5, 4, 8, 0 }, - { 688, 92, 68, 32, 6, 2, 16, 0 }, - { 24, 464, 412, 24, 6, 2, 16, 0 }, - { 112, 184, 144, 28, 6, 2, 16, 0 }, - { 5776, 64, 32, 36, 5, 4, 8, 0 }, - { 1568, 64, 40, 36, 5, 4, 8, 0 }, - { 2920, 64, 64, 24, 6, 2, 16, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 4, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 4, 4, 4, 1 }, - { 112, 184, 144, 28, 4, 4, 4, 1 }, - { 5776, 64, 32, 36, 4, 4, 4, 1 }, - { 1568, 64, 40, 36, 4, 4, 4, 1 }, - { 2920, 64, 64, 24, 4, 4, 4, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 4, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 4, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 4, 16, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1 }, - { 688, 92, 68, 32, 4, 4, 4, 1 }, - { 24, 464, 412, 24, 4, 2, 8, 1 }, - { 112, 184, 144, 28, 4, 2, 16, 1 }, - { 5776, 64, 32, 36, 4, 4, 
16, 1 }, - { 1568, 64, 40, 36, 4, 4, 8, 1 }, - { 2920, 64, 64, 24, 4, 4, 16, 1 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 4, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); @@ -318,17 +209,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_nt_nt; 
configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { configs_best_to_use = &configs_mnkb_best_nt_t; configs_fallback_to_use = &configs_mnkb_fallback_nt_t; } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { configs_best_to_use = &configs_mnkb_best_t_nt; configs_fallback_to_use = &configs_mnkb_fallback_t_nt; @@ -342,75 +233,46 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); - return select_info(desc0, - desc1, - m, n, k, b, DataType::F16, rhs_lock_padding); + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); } -MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) { ARM_COMPUTE_UNUSED(rhs_lock_padding); - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = - { - { 3136, 64, 64, 36, 6, 4, 4, 0 }, - { 4096, 48, 32, 36, 6, 4, 4, 0 }, - { 688, 92, 68, 32, 2, 8, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 6, 4, 4, 0 }, - { 5776, 64, 32, 36, 6, 4, 4, 0 }, - { 1568, 64, 40, 36, 6, 4, 4, 0 }, - { 2920, 64, 64, 24, 5, 4, 4, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = - { - { 3136, 64, 64, 36, 4, 4, 16, 0 }, - { 4096, 48, 32, 36, 4, 4, 16, 0 }, - { 688, 92, 68, 32, 4, 4, 16, 0 }, - { 24, 464, 412, 24, 6, 2, 16, 0 }, - { 112, 184, 144, 28, 4, 4, 16, 0 }, - { 5776, 64, 32, 36, 4, 4, 16, 0 }, - { 1568, 64, 40, 36, 6, 4, 4, 0 }, - { 2920, 64, 64, 24, 4, 4, 16, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = - { - { 3136, 64, 64, 36, 4, 4, 8, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 0 }, - { 688, 92, 68, 32, 4, 4, 4, 0 }, - { 24, 464, 412, 24, 4, 4, 4, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 0 }, - { 5776, 64, 32, 36, 4, 4, 8, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 4, 8, 0 } - }; - - const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = - { - { 3136, 64, 64, 36, 4, 2, 16, 0 }, - { 4096, 48, 32, 36, 4, 4, 4, 0 }, - { 688, 92, 68, 32, 4, 4, 8, 0 }, - { 24, 464, 412, 24, 4, 2, 16, 0 }, - { 112, 184, 144, 28, 4, 2, 16, 0 }, - { 5776, 64, 32, 36, 4, 4, 4, 0 }, - { 1568, 64, 40, 36, 4, 4, 8, 0 }, - { 2920, 64, 64, 24, 4, 2, 16, 0 } - }; + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 
0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}}; const bool adj_lhs = info.adj_lhs(); const bool adj_rhs = info.adj_rhs(); - if((adj_lhs == false) && (adj_rhs == false)) + if ((adj_lhs == false) && (adj_rhs == false)) { return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b); } - else if((adj_lhs == false) && (adj_rhs == true)) + else if ((adj_lhs == false) && (adj_rhs == true)) { return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b); } - else if((adj_lhs == true) && (adj_rhs == false)) + else if ((adj_lhs == true) && (adj_rhs == false)) { return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b); } @@ -419,5 +281,5 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b); } } -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h index fe167d18dd..6b39db6a3f 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h @@ -44,10 +44,13 @@ public: MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override; private: - MatMulKernelInfo configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - MatMulKernelInfo configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); - MatMulKernelInfo configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); }; -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp index 1e06e84d4d..89cad30214 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" + #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include @@ -37,22 +38,32 @@ namespace cl_matmul { MatMulKernelInfo 
select_info(const MatMulKernelInfo &info0, const MatMulKernelInfo &info1, - unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding) + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding) { - ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, "The fallback MatMul configuration cannot have export_to_cl_image = true"); - ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, "The MatMul configurations must have the same adj_lhs value"); - ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, "The MatMul configurations must have the same adj_rhs value"); + ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, + "The fallback MatMul configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, + "The MatMul configurations must have the same adj_lhs value"); + ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, + "The MatMul configurations must have the same adj_rhs value"); const bool adj_lhs = info0.adj_lhs; const bool adj_rhs = info0.adj_rhs; - TensorInfo lhs_info = !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type); - TensorInfo rhs_info = !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type); + TensorInfo lhs_info = + !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type); + TensorInfo rhs_info = + !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type); TensorInfo dst_info; - if(rhs_lock_padding == false) + if (rhs_lock_padding == false) { - if(bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0))) + if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0))) { return info0; } @@ -67,7 +78,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, } } -MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b) { size_t min_acc = std::numeric_limits::max(); size_t min_idx = 0; @@ -76,12 +93,13 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, + "The entry should have 8 integer values representing: M, N, K, B, M0, N0. 
K0, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast(configs[y][0]); size_t nc0 = static_cast(configs[y][1]); @@ -94,7 +112,7 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h index 3881617558..a114fffa68 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -52,7 +52,12 @@ using MatMulNativeConfigsMatrix = std::vector>; */ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, const MatMulKernelInfo &info1, - unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding); + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding); /** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user * @@ -66,7 +71,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0, * * @return @ref MatMulKernelInfo */ -MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b); } // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h index a2dbfc7dd5..b10018a6d2 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -45,7 +45,7 @@ public: */ static std::unique_ptr create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: @@ -56,6 +56,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_matmul } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h index 4f548bd01d..b9b091100c 100644 --- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h @@ -28,6 +28,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/core/common/Macros.h" namespace arm_compute @@ -53,8 +54,7 @@ public: * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, 
func_int8} { } @@ -66,7 +66,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -93,8 +93,7 @@ public: * * @param[in] arch GPU target */ - IClMatMulNativeKernelConfig(GPUTarget arch) - : _target(arch) + IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig); -- cgit v1.2.1
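
Illustrative sketch, not part of the upstream patch: the hunk for ClIndirectConvDefaultConfigValhall above chooses k0 for small output-feature-map counts by taking the largest block size that divides k exactly, so the kernel avoids a left-over loop. Only the k%16 and k%8 branches are visible in the hunk; the trailing 4/1 fallback below is an assumption, and the helper name pick_k0_for_small_ofm is hypothetical.

#include <cstdint>
#include <iostream>

// Pick the largest k0 that divides k, so no tail loop is needed.
static int32_t pick_k0_for_small_ofm(int32_t k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    if ((k % 4) == 0)
    {
        return 4; // assumption: branches beyond k%8 are not shown in the hunk
    }
    return 1;
}

int main()
{
    std::cout << pick_k0_for_small_ofm(48) << '\n'; // 16
    std::cout << pick_k0_for_small_ofm(20) << '\n'; // 4
    return 0;
}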
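
Illustrative sketch, not part of the upstream patch: the reflowed MatMulNativeConfigsMatrix tables store one row per reference workload, laid out as {M, N, K, B, M0, N0, K0, IMG_RHS}. find_info (shown in the ClMatMulNativeHelpers.cpp hunk) picks the row whose (M, N, K, B) lies closest, in Euclidean distance, to the requested problem size and reads the block sizes from it. The names Row and nearest_row below are hypothetical stand-ins for the library types.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

using Row = std::vector<int32_t>; // {M, N, K, B, M0, N0, K0, IMG_RHS}

static size_t nearest_row(const std::vector<Row> &configs, int32_t m, int32_t n, int32_t k, int32_t b)
{
    size_t min_idx = 0;
    double min_acc = std::numeric_limits<double>::max();
    for (size_t y = 0; y < configs.size(); ++y)
    {
        // Squared distance between the requested workload and this reference row.
        const double dm  = m - configs[y][0];
        const double dn  = n - configs[y][1];
        const double dk  = k - configs[y][2];
        const double db  = b - configs[y][3];
        const double acc = std::sqrt(dm * dm + dn * dn + dk * dk + db * db);
        if (acc < min_acc)
        {
            min_acc = acc;
            min_idx = y;
        }
    }
    return min_idx;
}

int main()
{
    const std::vector<Row> configs = {{3136, 64, 64, 36, 4, 4, 16, 1}, {24, 464, 412, 24, 2, 8, 4, 1}};
    const size_t idx = nearest_row(configs, 3000, 64, 64, 32);
    std::cout << "M0=" << configs[idx][4] << " N0=" << configs[idx][5] << " K0=" << configs[idx][6] << '\n';
    return 0;
}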
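
Illustrative sketch, not part of the upstream patch: select_info in the same hunk keeps a "best" and a "fallback" configuration per data type. The best one may export the RHS to a cl_image; the library validates it with ClMatMulNativeKernel::validate and, if that fails, returns the fallback, which must never rely on cl_image export. The is_supported predicate and the KernelConfig struct below are hypothetical placeholders for that validation step.

#include <functional>
#include <iostream>

struct KernelConfig
{
    int  m0;
    int  n0;
    int  k0;
    bool export_rhs_to_cl_image;
};

// Return the best configuration when it is supported, otherwise the fallback.
static KernelConfig select_config(const KernelConfig &best,
                                  const KernelConfig &fallback,
                                  const std::function<bool(const KernelConfig &)> &is_supported)
{
    return is_supported(best) ? best : fallback;
}

int main()
{
    const KernelConfig best{4, 4, 16, true};
    const KernelConfig fallback{4, 4, 8, false};
    // Hypothetical device without cl_image export support: the fallback is chosen.
    const KernelConfig chosen =
        select_config(best, fallback, [](const KernelConfig &c) { return !c.export_rhs_to_cl_image; });
    std::cout << chosen.k0 << '\n'; // 8
    return 0;
}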