aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'arm_compute/runtime')
-rw-r--r--arm_compute/runtime/Allocator.h7
-rw-r--r--arm_compute/runtime/Array.h8
-rw-r--r--arm_compute/runtime/BlobLifetimeManager.h3
-rw-r--r--arm_compute/runtime/BlobMemoryPool.h5
-rw-r--r--arm_compute/runtime/CL/CLArray.h11
-rw-r--r--arm_compute/runtime/CL/CLBufferAllocator.h6
-rw-r--r--arm_compute/runtime/CL/CLFunctions.h29
-rw-r--r--arm_compute/runtime/CL/CLMemory.h7
-rw-r--r--arm_compute/runtime/CL/CLMemoryRegion.h16
-rw-r--r--arm_compute/runtime/CL/CLRuntimeContext.h8
-rw-r--r--arm_compute/runtime/CL/CLScheduler.h52
-rw-r--r--arm_compute/runtime/CL/CLSubTensor.h9
-rw-r--r--arm_compute/runtime/CL/CLTensor.h6
-rw-r--r--arm_compute/runtime/CL/CLTensorAllocator.h7
-rw-r--r--arm_compute/runtime/CL/CLTuner.h62
-rw-r--r--arm_compute/runtime/CL/CLTunerTypes.h13
-rw-r--r--arm_compute/runtime/CL/CLTuningParams.h34
-rw-r--r--arm_compute/runtime/CL/CLTypes.h28
-rw-r--r--arm_compute/runtime/CL/ICLGEMMKernelSelection.h6
-rw-r--r--arm_compute/runtime/CL/ICLOperator.h5
-rw-r--r--arm_compute/runtime/CL/ICLTuner.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLActivationLayer.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h20
-rw-r--r--arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h38
-rw-r--r--arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h41
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseAnd.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseNot.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseOr.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseXor.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h16
-rw-r--r--arm_compute/runtime/CL/functions/CLCast.h39
-rw-r--r--arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLComparison.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLConcatenateLayer.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLConv3D.h116
-rw-r--r--arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h27
-rw-r--r--arm_compute/runtime/CL/functions/CLConvolutionLayer.h84
-rw-r--r--arm_compute/runtime/CL/functions/CLCopy.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLCrop.h34
-rw-r--r--arm_compute/runtime/CL/functions/CLCropResize.h29
-rw-r--r--arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h45
-rw-r--r--arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthConvertLayer.h9
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h315
-rw-r--r--arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h25
-rw-r--r--arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h33
-rw-r--r--arm_compute/runtime/CL/functions/CLElementwiseOperations.h119
-rw-r--r--arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLFFT1D.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLFFT2D.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h31
-rw-r--r--arm_compute/runtime/CL/functions/CLFill.h9
-rw-r--r--arm_compute/runtime/CL/functions/CLFillBorder.h67
-rw-r--r--arm_compute/runtime/CL/functions/CLFloor.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h149
-rw-r--r--arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h41
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMM.h31
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h247
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h21
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h83
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h272
-rw-r--r--arm_compute/runtime/CL/functions/CLGather.h17
-rw-r--r--arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h23
-rw-r--r--arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h135
-rw-r--r--arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h24
-rw-r--r--arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLLSTMLayer.h91
-rw-r--r--arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h135
-rw-r--r--arm_compute/runtime/CL/functions/CLLogicalAnd.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLLogicalNot.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLLogicalOr.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLMatMul.h134
-rw-r--r--arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h13
-rw-r--r--arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLNormalizationLayer.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h29
-rw-r--r--arm_compute/runtime/CL/functions/CLPadLayer.h20
-rw-r--r--arm_compute/runtime/CL/functions/CLPermute.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h48
-rw-r--r--arm_compute/runtime/CL/functions/CLPooling3dLayer.h106
-rw-r--r--arm_compute/runtime/CL/functions/CLPoolingLayer.h20
-rw-r--r--arm_compute/runtime/CL/functions/CLPriorBoxLayer.h13
-rw-r--r--arm_compute/runtime/CL/functions/CLQLSTMLayer.h380
-rw-r--r--arm_compute/runtime/CL/functions/CLRNNLayer.h29
-rw-r--r--arm_compute/runtime/CL/functions/CLROIAlignLayer.h14
-rw-r--r--arm_compute/runtime/CL/functions/CLROIPoolingLayer.h14
-rw-r--r--arm_compute/runtime/CL/functions/CLRange.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLReduceMean.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLReductionOperation.h18
-rw-r--r--arm_compute/runtime/CL/functions/CLRemap.h83
-rw-r--r--arm_compute/runtime/CL/functions/CLReshapeLayer.h1
-rw-r--r--arm_compute/runtime/CL/functions/CLReverse.h44
-rw-r--r--arm_compute/runtime/CL/functions/CLScale.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLScatter.h110
-rw-r--r--arm_compute/runtime/CL/functions/CLSelect.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLSlice.h18
-rw-r--r--arm_compute/runtime/CL/functions/CLSoftmaxLayer.h20
-rw-r--r--arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h34
-rw-r--r--arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLSplit.h1
-rw-r--r--arm_compute/runtime/CL/functions/CLStackLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLStridedSlice.h57
-rw-r--r--arm_compute/runtime/CL/functions/CLTile.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLTranspose.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLUnstack.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h31
-rw-r--r--arm_compute/runtime/CL/tuners/CLTuningParametersList.h1
-rw-r--r--arm_compute/runtime/CPP/CPPScheduler.h6
-rw-r--r--arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h25
-rw-r--r--arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h24
-rw-r--r--arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h22
-rw-r--r--arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h18
-rw-r--r--arm_compute/runtime/CPP/functions/CPPPermute.h5
-rw-r--r--arm_compute/runtime/CPP/functions/CPPSplit.h57
-rw-r--r--arm_compute/runtime/CPP/functions/CPPTopKV.h6
-rw-r--r--arm_compute/runtime/CPP/functions/CPPUpsample.h5
-rw-r--r--arm_compute/runtime/FunctionDescriptors.h67
-rw-r--r--arm_compute/runtime/IAllocator.h2
-rw-r--r--arm_compute/runtime/IFunction.h2
-rw-r--r--arm_compute/runtime/IMemoryGroup.h5
-rw-r--r--arm_compute/runtime/IMemoryManager.h2
-rw-r--r--arm_compute/runtime/IMemoryPool.h2
-rw-r--r--arm_compute/runtime/IMemoryRegion.h3
-rw-r--r--arm_compute/runtime/IPoolManager.h2
-rw-r--r--arm_compute/runtime/IScheduler.h27
-rw-r--r--arm_compute/runtime/ISimpleLifetimeManager.h18
-rw-r--r--arm_compute/runtime/ITensorAllocator.h6
-rw-r--r--arm_compute/runtime/ITransformWeights.h8
-rw-r--r--arm_compute/runtime/IWeightsManager.h24
-rw-r--r--arm_compute/runtime/Memory.h5
-rw-r--r--arm_compute/runtime/MemoryGroup.h21
-rw-r--r--arm_compute/runtime/MemoryManagerOnDemand.h12
-rw-r--r--arm_compute/runtime/MemoryRegion.h24
-rw-r--r--arm_compute/runtime/NEON/INEOperator.h6
-rw-r--r--arm_compute/runtime/NEON/INESimpleFunction.h2
-rw-r--r--arm_compute/runtime/NEON/NEFunctions.h26
-rw-r--r--arm_compute/runtime/NEON/NEScheduler.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEActivationLayer.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NEAddMulAdd.h115
-rw-r--r--arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h24
-rw-r--r--arm_compute/runtime/NEON/functions/NEArithmeticAddition.h19
-rw-r--r--arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h18
-rw-r--r--arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h24
-rw-r--r--arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NECast.h7
-rw-r--r--arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEConcatenateLayer.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEConv3D.h100
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h52
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvolutionLayer.h71
-rw-r--r--arm_compute/runtime/NEON/functions/NECopy.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NECropResize.h18
-rw-r--r--arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h74
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h24
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h69
-rw-r--r--arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h16
-rw-r--r--arm_compute/runtime/NEON/functions/NEElementwiseOperations.h56
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFT1D.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFT2D.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h21
-rw-r--r--arm_compute/runtime/NEON/functions/NEFill.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFillBorder.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEFloor.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h117
-rw-r--r--arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h27
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMM.h100
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConv2d.h14
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h351
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h107
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h242
-rw-r--r--arm_compute/runtime/NEON/functions/NEGather.h13
-rw-r--r--arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h13
-rw-r--r--arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NELSTMLayer.h58
-rw-r--r--arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h95
-rw-r--r--arm_compute/runtime/NEON/functions/NEMatMul.h145
-rw-r--r--arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h17
-rw-r--r--arm_compute/runtime/NEON/functions/NENormalizationLayer.h16
-rw-r--r--arm_compute/runtime/NEON/functions/NEPadLayer.h21
-rw-r--r--arm_compute/runtime/NEON/functions/NEPermute.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h28
-rw-r--r--arm_compute/runtime/NEON/functions/NEPooling3dLayer.h96
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h9
-rw-r--r--arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEQLSTMLayer.h362
-rw-r--r--arm_compute/runtime/NEON/functions/NERNNLayer.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIAlignLayer.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h9
-rw-r--r--arm_compute/runtime/NEON/functions/NERange.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEReduceMean.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEReductionOperation.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NERemap.h67
-rw-r--r--arm_compute/runtime/NEON/functions/NEReorderLayer.h94
-rw-r--r--arm_compute/runtime/NEON/functions/NEReverse.h40
-rw-r--r--arm_compute/runtime/NEON/functions/NEScale.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NESlice.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h21
-rw-r--r--arm_compute/runtime/NEON/functions/NESplit.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEStackLayer.h13
-rw-r--r--arm_compute/runtime/NEON/functions/NEStridedSlice.h44
-rw-r--r--arm_compute/runtime/NEON/functions/NETile.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NETranspose.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEUnstack.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h106
-rw-r--r--arm_compute/runtime/OMP/OMPScheduler.h10
-rw-r--r--arm_compute/runtime/OffsetLifetimeManager.h3
-rw-r--r--arm_compute/runtime/OffsetMemoryPool.h5
-rw-r--r--arm_compute/runtime/OperatorList.h78
-rw-r--r--arm_compute/runtime/OperatorTensor.h2
-rw-r--r--arm_compute/runtime/PoolManager.h12
-rw-r--r--arm_compute/runtime/RuntimeContext.h4
-rw-r--r--arm_compute/runtime/Scheduler.h12
-rw-r--r--arm_compute/runtime/SubTensor.h2
-rw-r--r--arm_compute/runtime/Tensor.h2
-rw-r--r--arm_compute/runtime/TensorAllocator.h1
-rw-r--r--arm_compute/runtime/common/LSTMParams.h18
221 files changed, 4462 insertions, 3348 deletions
diff --git a/arm_compute/runtime/Allocator.h b/arm_compute/runtime/Allocator.h
index 83f072ab6b..e99ddb3dac 100644
--- a/arm_compute/runtime/Allocator.h
+++ b/arm_compute/runtime/Allocator.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ALLOCATOR_H
#include "arm_compute/runtime/IAllocator.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -40,9 +39,9 @@ public:
Allocator() = default;
// Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
+ void *allocate(size_t size, size_t alignment) override;
+ void free(void *ptr) override;
std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ALLOCATOR_H */
diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h
index 21d9c25c87..9283273317 100644
--- a/arm_compute/runtime/Array.h
+++ b/arm_compute/runtime/Array.h
@@ -37,16 +37,14 @@ class Array : public IArray<T>
{
public:
/** Default constructor: empty array */
- Array()
- : IArray<T>(0), _values(nullptr)
+ Array() : IArray<T>(0), _values(nullptr)
{
}
/** Constructor: initializes an array which can contain up to max_num_points values
*
* @param[in] max_num_values Maximum number of values the array will be able to stored
*/
- Array(size_t max_num_values)
- : IArray<T>(max_num_values), _values(std::make_unique<T[]>(max_num_values))
+ Array(size_t max_num_values) : IArray<T>(max_num_values), _values(std::make_unique<T[]>(max_num_values))
{
}
@@ -72,5 +70,5 @@ using Int16Array = Array<int16_t>;
using Int32Array = Array<int32_t>;
/** Array of floats. */
using FloatArray = Array<float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_ARRAY_H */
diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h
index 0d69f2e7c5..18ffe96ee5 100644
--- a/arm_compute/runtime/BlobLifetimeManager.h
+++ b/arm_compute/runtime/BlobLifetimeManager.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_BLOBLIFETIMEMANAGER_H
#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-
#include "arm_compute/runtime/Types.h"
#include <memory>
@@ -62,7 +61,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- MappingType mapping_type() const override;
+ MappingType mapping_type() const override;
private:
// Inherited methods overridden:
diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h
index 8481fa20f9..b25efc3821 100644
--- a/arm_compute/runtime/BlobMemoryPool.h
+++ b/arm_compute/runtime/BlobMemoryPool.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_BLOBMEMORYPOOL_H
#include "arm_compute/runtime/IMemoryPool.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include "arm_compute/runtime/Types.h"
@@ -62,8 +61,8 @@ public:
BlobMemoryPool &operator=(BlobMemoryPool &&) = default;
// Inherited methods overridden:
- void acquire(MemoryMappings &handles) override;
- void release(MemoryMappings &handles) override;
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
MappingType mapping_type() const override;
std::unique_ptr<IMemoryPool> duplicate() override;
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
index 7efe208b9f..6e81a46a29 100644
--- a/arm_compute/runtime/CL/CLArray.h
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -38,8 +38,7 @@ class CLArray : public ICLArray<T>
{
public:
/** Default constructor: empty array */
- CLArray()
- : ICLArray<T>(0), _buffer()
+ CLArray() : ICLArray<T>(0), _buffer()
{
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -55,7 +54,8 @@ public:
* @param[in] max_num_values Maximum number of values the array will be able to stored
*/
CLArray(size_t max_num_values)
- : ICLArray<T>(max_num_values), _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T))
+ : ICLArray<T>(max_num_values),
+ _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T))
{
}
/** Enqueue a map operation of the allocated buffer.
@@ -91,7 +91,8 @@ protected:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override
{
ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(
+ _buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
}
void do_unmap(cl::CommandQueue &q, uint8_t *mapping) override
{
@@ -114,5 +115,5 @@ using CLInt16Array = CLArray<cl_short>;
using CLInt32Array = CLArray<cl_int>;
/** OpenCL Array of floats. */
using CLFloatArray = CLArray<cl_float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLARRAY_H */
diff --git a/arm_compute/runtime/CL/CLBufferAllocator.h b/arm_compute/runtime/CL/CLBufferAllocator.h
index 7467e9d1c6..00ff017012 100644
--- a/arm_compute/runtime/CL/CLBufferAllocator.h
+++ b/arm_compute/runtime/CL/CLBufferAllocator.h
@@ -35,9 +35,9 @@ class CLBufferAllocator final : public IAllocator
{
public:
// Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
+ void *allocate(size_t size, size_t alignment) override;
+ void free(void *ptr) override;
std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLBUFFERALLOCATOR_H */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 62c94152e8..a09ca551d2 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLFUNCTIONS_H
-#define ARM_COMPUTE_CLFUNCTIONS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H
/* Header regrouping all the CL functions */
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
@@ -38,6 +38,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
#include "arm_compute/runtime/CL/functions/CLComparison.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConv3D.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
@@ -57,48 +58,50 @@
#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLFill.h"
-#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
#include "arm_compute/runtime/CL/functions/CLFloor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h"
+#include "arm_compute/runtime/CL/functions/CLGather.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/CL/functions/CLGather.h"
#include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h"
+#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
-#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
-#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
#include "arm_compute/runtime/CL/functions/CLLogicalNot.h"
#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
+#include "arm_compute/runtime/CL/functions/CLMatMul.h"
#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
-#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
#include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h"
#include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
-#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
-#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
#include "arm_compute/runtime/CL/functions/CLReorgLayer.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLScale.h"
+#include "arm_compute/runtime/CL/functions/CLScatter.h"
#include "arm_compute/runtime/CL/functions/CLSelect.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
@@ -112,4 +115,4 @@
#include "arm_compute/runtime/CL/functions/CLUnstack.h"
#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
-#endif /* ARM_COMPUTE_CLFUNCTIONS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H
diff --git a/arm_compute/runtime/CL/CLMemory.h b/arm_compute/runtime/CL/CLMemory.h
index 7adee66c73..5abe86bd53 100644
--- a/arm_compute/runtime/CL/CLMemory.h
+++ b/arm_compute/runtime/CL/CLMemory.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H
#define ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H
-#include "arm_compute/runtime/IMemory.h"
-
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+#include "arm_compute/runtime/IMemory.h"
#include <cstddef>
#include <memory>
@@ -75,8 +74,8 @@ public:
// Inherited methods overridden:
IMemoryRegion *region() final;
IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
+ void set_region(IMemoryRegion *region) final;
+ void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
private:
ICLMemoryRegion *_region;
diff --git a/arm_compute/runtime/CL/CLMemoryRegion.h b/arm_compute/runtime/CL/CLMemoryRegion.h
index 1fd8fdb79e..365973a9e6 100644
--- a/arm_compute/runtime/CL/CLMemoryRegion.h
+++ b/arm_compute/runtime/CL/CLMemoryRegion.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,10 +85,9 @@ public:
std::unique_ptr<IMemoryRegion> extract_subregion(size_t offset, size_t size) override;
protected:
- cl::CommandQueue _queue;
- cl::Context _ctx;
- void *_mapping;
- cl::Buffer _mem;
+ cl::Context _ctx;
+ void *_mapping;
+ cl::Buffer _mem;
};
/** OpenCL buffer memory region implementation */
@@ -106,11 +105,12 @@ public:
* @param[in] buffer Buffer to be used as a memory region
*/
CLBufferMemoryRegion(const cl::Buffer &buffer);
+ virtual ~CLBufferMemoryRegion() override;
// Inherited methods overridden :
void *ptr() final;
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
/** OpenCL SVM memory region interface */
@@ -156,7 +156,7 @@ public:
// Inherited methods overridden :
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
/** OpenCL fine-grain SVM memory region implementation */
@@ -173,7 +173,7 @@ public:
// Inherited methods overridden :
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CL_CL_MEMORY_REGION_H */
diff --git a/arm_compute/runtime/CL/CLRuntimeContext.h b/arm_compute/runtime/CL/CLRuntimeContext.h
index dd17645fa7..2ed4b74796 100644
--- a/arm_compute/runtime/CL/CLRuntimeContext.h
+++ b/arm_compute/runtime/CL/CLRuntimeContext.h
@@ -54,11 +54,11 @@ public:
CLKernelLibrary &kernel_library();
private:
- std::unique_ptr<CLScheduler> _gpu_owned_scheduler{ nullptr };
- CLScheduler *_gpu_scheduler{ nullptr };
- CLTuner _tuner{ false };
+ std::unique_ptr<CLScheduler> _gpu_owned_scheduler{nullptr};
+ CLScheduler *_gpu_scheduler{nullptr};
+ CLTuner _tuner{false};
CLSymbols _symbols{};
- CLBackendType _backend_type{ CLBackendType::Native };
+ CLBackendType _backend_type{CLBackendType::Native};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLRUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 56852aec6e..b74fcb74ef 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,8 @@
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLTypes.h"
@@ -63,7 +63,9 @@ public:
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
* @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void default_init(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native);
+ void default_init(ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Initialises the scheduler with context and device provided by the user
*
* @param[in] device OpenCL device to be used
@@ -71,22 +73,35 @@ public:
* @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
*/
- void default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr);
+ void default_init_with_context(cl::Device &device,
+ cl::Context &ctx,
+ ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr);
+
+ /** Re-initializes the context and command queue used by the scheduler to default values
+ * and sets a default device and kernel path for the @ref CLKernelLibrary.
+ *
+ * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
+ * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
+ * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
+ */
+ void default_reinit(ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Schedule the execution of the passed kernel if possible.
*
* @param[in] kernel Kernel to execute.
- * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
+ * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
*/
void enqueue(ICLKernel &kernel, bool flush = true);
/** Schedule the execution of the passed kernel if possible.
*
* @param[in] kernel Kernel to execute.
* @param[in] tensors Vector containing the tensors to operate on.
- * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
+ * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
*/
void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush = true);
-
/** Initialises the context and command queue to be used by the scheduler.
*
* @param[in] context A CL context.
@@ -97,8 +112,12 @@ public:
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
* @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr,
- CLBackendType cl_backend_type = CLBackendType::Native);
+ void init(cl::Context context,
+ cl::CommandQueue queue,
+ const cl::Device &device,
+ ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Accessor for the associated CL context.
*
@@ -163,10 +182,22 @@ public:
*/
void tune_kernel_static(ICLKernel &kernel);
+ /** Enable job chaining. The command queue will only be flushed when @p job_chaining_size kernels have been enqueued.
+ *
+ * @param[in] job_chaining_size Kernels to enqueue before flushing
+ */
+ void enable_job_chaining(int job_chaining_size);
+
bool is_initialised() const;
private:
void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush);
+ /** If job chain is disabled, then flush the command queue according to @p flush. Otherwise @p flush is ignored and the queue is only flushed when job chain count exceeds allocated job chain size
+ *
+ * @param[in] flush Flush the command queue. Ignored when job chain is enabled.
+ */
+ void flush_queue(bool flush);
+
/** Flag to ensure symbols initialisation is happening before Scheduler creation */
static std::once_flag _initialize_symbols;
@@ -177,6 +208,9 @@ private:
ICLTuner *_cl_tuner;
CLGEMMHeuristicsHandle *_gemm_heuristics;
CLBackendType _backend_type;
+ bool _job_chaining_enabled;
+ int _job_chaining_size;
+ int _job_chaining_count;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLSCHEDULER_H */
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
index 0a7f5f89b2..c18df8086a 100644
--- a/arm_compute/runtime/CL/CLSubTensor.h
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -46,7 +46,10 @@ public:
* @param[in] coords Coordinates of the first subtensor element inside the parent tensor.
* @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds
*/
- CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent = false);
+ CLSubTensor(ICLTensor *parent,
+ const TensorShape &tensor_shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~CLSubTensor() = default;
/** Restrict instances of this class to be copy constructed */
@@ -93,11 +96,11 @@ public:
protected:
// Inherited methods overridden:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
+ void do_unmap(cl::CommandQueue &q) override;
private:
ICLTensor *_parent;
mutable SubTensorInfo _info;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLSUBTENSOR_H */
diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h
index ae73351f27..0729935e9e 100644
--- a/arm_compute/runtime/CL/CLTensor.h
+++ b/arm_compute/runtime/CL/CLTensor.h
@@ -87,17 +87,17 @@ public:
TensorInfo *info() override;
const cl::Buffer &cl_buffer() const override;
CLQuantization quantization() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
+ void associate_memory_group(IMemoryGroup *memory_group) override;
CLRuntimeContext *context();
protected:
// Inherited methods overridden:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
+ void do_unmap(cl::CommandQueue &q) override;
private:
mutable CLTensorAllocator _allocator; /**< Instance of the OpenCL tensor allocator */
- CLRuntimeContext *_ctx{ nullptr };
+ CLRuntimeContext *_ctx{nullptr};
};
/** OpenCL Image */
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 1b061ee1d6..fde8e9c43a 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -24,15 +24,14 @@
#ifndef ARM_COMPUTE_CLTENSORALLOCATOR_H
#define ARM_COMPUTE_CLTENSORALLOCATOR_H
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLMemory.h"
#include "arm_compute/runtime/IAllocator.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
#include <cstdint>
namespace arm_compute
@@ -148,7 +147,7 @@ private:
static const cl::Buffer _empty_buffer;
private:
- CLRuntimeContext *_ctx;
+ CLRuntimeContext *_ctx;
IMemoryManageable *_owner; /**< Memory manageable object that owns the allocator */
IMemoryGroup *_associated_memory_group; /**< Registered memory manager */
CLMemory _memory; /**< OpenCL memory */
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index f1bae3f8dc..cf293d3d27 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,24 +75,6 @@ public:
*/
void set_tuner_mode(CLTunerMode mode);
- /** Get the current OpenCL tuner mode
- *
- * @return tuner_mode Indicates how exhaustive the search for the optimal tuning parameters should be while tuning
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.08 release
- */
- CLTunerMode get_tuner_mode() const;
-
- /** Manually add a LWS for a kernel
- *
- * @param[in] kernel_id Unique identifiant of the kernel
- * @param[in] optimal_lws Optimal local workgroup size to use for the given kernel
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.08 release
- */
- ARM_COMPUTE_DEPRECATED_REL_REPLACE(21.02, add_tuning_params)
- void add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws);
-
/** Manually add tuning parameters for a kernel
*
* @param[in] kernel_id Unique identifiant of the kernel
@@ -100,30 +82,12 @@ public:
*/
void add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params);
- /** Import LWS table
- *
- * @param[in] lws_table The unordered_map container to import
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.08 release
- */
- ARM_COMPUTE_DEPRECATED_REL_REPLACE(21.02, import_tuning_params)
- void import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table);
-
/** Import tuning parameters table
*
* @param[in] tuning_params_table The unordered_map container to import
*/
void import_tuning_params(const std::unordered_map<std::string, CLTuningParams> &tuning_params_table);
- /** Give read access to the LWS table
- *
- * @return The lws table as unordered_map container
- *
- * @deprecated This function is deprecated and is intended to be removed in 21.08 release
- */
- ARM_COMPUTE_DEPRECATED_REL_REPLACE(21.02, tuning_params_table)
- const std::unordered_map<std::string, cl::NDRange> &lws_table();
-
/** Give read access to the tuning params table
*
* @return The tuning params table as unordered_map container
@@ -160,28 +124,38 @@ public:
void tune_kernel_static(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
-
/** Is the kernel_event set ?
*
* @return true if the kernel_event is set.
*/
bool kernel_event_is_set() const;
+ /** A wrapper wrapping tensors and other objects needed for running the kernel
+ */
+ struct IKernelData;
+
private:
+ /** Perform tune_kernel_dynamic
+ *
+ * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
+ * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel
+ *
+ */
+ void do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data);
/** Find optimal tuning parameters using brute-force approach
*
- * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
- * @param[in,out] tensors Tensors for the kernel to operate on
+ * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
+ * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel
*
* @return The optimal tuning parameters to use
*/
- CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors);
+ CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data);
std::unordered_map<std::string, CLTuningParams> _tuning_params_table;
std::unordered_map<std::string, cl::NDRange> _lws_table;
- cl::Event _kernel_event;
- bool _tune_new_kernels;
- CLTuningInfo _tuning_info;
+ cl::Event _kernel_event;
+ bool _tune_new_kernels;
+ CLTuningInfo _tuning_info;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLTUNER_H */
diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h
index 508cafac95..d9b914676a 100644
--- a/arm_compute/runtime/CL/CLTunerTypes.h
+++ b/arm_compute/runtime/CL/CLTunerTypes.h
@@ -43,7 +43,7 @@ enum class CLTunerMode
struct CLTuningInfo
{
CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */
- bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units.
+ bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units.
Internally, the library will check if this feature is available on
the target platform. This OpenCL tuner extension is still in experimental phase */
};
@@ -56,11 +56,10 @@ struct CLTuningInfo
*/
inline CLTunerMode tuner_mode_from_name(const std::string &name)
{
- static const std::map<std::string, CLTunerMode> tuner_modes =
- {
- { "exhaustive", CLTunerMode::EXHAUSTIVE },
- { "normal", CLTunerMode::NORMAL },
- { "rapid", CLTunerMode::RAPID },
+ static const std::map<std::string, CLTunerMode> tuner_modes = {
+ {"exhaustive", CLTunerMode::EXHAUSTIVE},
+ {"normal", CLTunerMode::NORMAL},
+ {"rapid", CLTunerMode::RAPID},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -71,7 +70,7 @@ inline CLTunerMode tuner_mode_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h
index b50481336b..a876fad112 100644
--- a/arm_compute/runtime/CL/CLTuningParams.h
+++ b/arm_compute/runtime/CL/CLTuningParams.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
+
#include "support/StringSupport.h"
#include <ostream>
@@ -36,20 +37,26 @@ namespace arm_compute
class CLTuningParams
{
public:
- CLTuningParams(const CLTuningParams &) = default;
+ CLTuningParams(const CLTuningParams &tuning_params) : _lws(tuning_params._lws), _wbsm(tuning_params._wbsm)
+ {
+ }
CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0, int wbsm = 0)
: _lws(lws_x, lws_y, lws_z), _wbsm(wbsm)
{
}
- CLTuningParams(cl::NDRange lws, cl_int wbsm = 0)
- : _lws(lws), _wbsm(wbsm)
+ CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) : _lws(lws), _wbsm(wbsm)
{
}
- CLTuningParams(cl_int wbsm)
- : CLTuningParams(cl::NullRange, wbsm)
+ CLTuningParams(cl_int wbsm) : CLTuningParams(cl::NullRange, wbsm)
+ {
+ }
+ CLTuningParams &operator=(const CLTuningParams &other)
{
+ _lws = other._lws;
+ _wbsm = other._wbsm;
+ return *this;
}
void set_lws(cl::NDRange lws)
@@ -75,8 +82,9 @@ public:
std::string to_string(CLTuningInfo tuning_info)
{
std::string tuning_params_string = "";
- tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]);
- if(tuning_info.tune_wbsm)
+ tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) +
+ ";" + support::cpp11::to_string(_lws[2]);
+ if (tuning_info.tune_wbsm)
{
tuning_params_string += ";" + support::cpp11::to_string(_wbsm);
}
@@ -89,19 +97,19 @@ public:
std::vector<std::string> array;
std::stringstream ss(tuning_params_string);
std::string temp;
- while(ss >> temp)
+ while (ss >> temp)
{
array.push_back(temp);
}
// Read 3 values for lws
- if(array.size() < 3)
+ if (array.size() < 3)
{
return false;
}
const unsigned int lws_0 = support::cpp11::stoi(array[0]);
const unsigned int lws_1 = support::cpp11::stoi(array[1]);
const unsigned int lws_2 = support::cpp11::stoi(array[2]);
- if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
+ if (lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
{
// If lws values are 0, cl::NullRange has to be used
// otherwise the lws object will be badly created
@@ -112,9 +120,9 @@ public:
_lws = cl::NDRange(lws_0, lws_1, lws_2);
}
array.erase(array.begin(), array.begin() + 3);
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- if(array.size() < 1)
+ if (array.size() < 1)
{
return false;
}
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index cf0486c8c3..931740c47f 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,33 +30,25 @@ namespace arm_compute
/** OpenCL GEMM kernel types */
enum class CLGEMMKernelType
{
- /** Native GEMM kernel with fixed block size.
- * @note Temporary variant to keep compatibility with the old implementation.
- * @note This variant will be deprecated in favor of a new and configurable NATIVE variant
- */
- NATIVE_V1,
/** Native GEMM kernel with configurable block size.*/
NATIVE,
- /** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Fixed block size fixed.
- * @note Temporary variant to keep compatibility with the old implementation.
- * @note This variant will be deprecated in favor of RESHAPED
- */
- RESHAPED_V1,
/** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Configurable reshape and block size */
RESHAPED,
/** Reshaped GEMM kernel where only the rhs matrix is reshaped. Configurable reshape and block size */
- RESHAPED_ONLY_RHS
+ RESHAPED_ONLY_RHS,
+ /** Reshaped GEMM kernel where only the rhs matrix is reshaped. Using MMUL with configurable block size. */
+ RESHAPED_ONLY_RHS_MMUL
};
/** OpenCL GEMM kernel selection parameters. These information are retrieved to select the GEMM kernel on OpenCL */
struct CLGEMMKernelSelectionParams
{
- unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
- unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int b{ 0 }; /**< Batch size */
- bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
- DataType data_type{ DataType::UNKNOWN }; /**< Data type */
+ unsigned int m{0}; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
+ unsigned int n{0}; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int k{0}; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int b{0}; /**< Batch size */
+ bool is_rhs_constant{false}; /**< True if the content of the rhs matrix is constant */
+ DataType data_type{DataType::UNKNOWN}; /**< Data type */
};
/** List the possible OpenCL backends */
diff --git a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
index 7be9393388..5a71a61203 100644
--- a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
+++ b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
@@ -40,8 +40,7 @@ public:
*
* @param[in] arch GPU target
*/
- ICLGEMMKernelSelection(GPUTarget arch)
- : _target(arch)
+ ICLGEMMKernelSelection(GPUTarget arch) : _target(arch)
{
}
/** Default Move Constructor. */
@@ -59,7 +58,8 @@ public:
virtual CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) = 0;
protected:
- GPUTarget _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */
+ GPUTarget
+ _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */
};
} // namespace cl_gemm
} // namespace arm_compute
diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h
index 38bcaf32f2..c0826e7733 100644
--- a/arm_compute/runtime/CL/ICLOperator.h
+++ b/arm_compute/runtime/CL/ICLOperator.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ICLOPERATOR_H
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/IOperator.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include "arm_compute/runtime/Types.h"
@@ -56,8 +55,8 @@ public:
ICLOperator &operator=(ICLOperator &&) = default;
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
MemoryRequirements workspace() const override;
protected:
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index 0f951c384e..fa7a1424b8 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
index e6bb192532..e158efa093 100644
--- a/arm_compute/runtime/CL/functions/CLActivationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLACTIVATIONLAYER_H
#define ARM_COMPUTE_CLACTIVATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -90,7 +90,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] act_info Activation layer parameters.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ ActivationLayerInfo act_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index a971163c45..d340d20a1f 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,7 +91,11 @@ public:
* @param[out] output Output source tensor. Data types supported: U32/S32.
* @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int axis,
+ ICLTensor *output,
+ const ReductionOperation &op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
*
* @param[in] input Input source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
@@ -107,13 +111,11 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- std::vector<CLTensor> _results_vector;
- CLTensor _not_reshaped_output;
- std::vector<std::unique_ptr<CLArgMinMaxLayerKernel>> _reduction_kernels_vector;
- CLReshapeLayer _reshape;
- unsigned int _num_of_stages;
- unsigned int _reduction_axis;
+ MemoryGroup _memory_group;
+ CLTensor _not_reshaped_output;
+ std::unique_ptr<CLArgMinMaxLayerKernel> _arg_min_max_kernel;
+ CLReshapeLayer _reshape;
+ unsigned int _reduction_axis;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLARGMINMAXLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
index fcfeb5ea3b..f57bc8fe8b 100644
--- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,9 @@
#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -83,7 +83,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -101,9 +107,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -119,10 +131,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
index f6ba2b0b02..20b9fdafed 100644
--- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYER_H
#define ARM_COMPUTE_CLBATCHTOSPACELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -67,7 +66,10 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
/** Set the input and output tensors.
*
@@ -75,16 +77,27 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Set the input and output tensors. (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -92,8 +105,14 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -101,7 +120,10 @@ public:
* @param[out] output Tensor output info. Data types supported: same as @p input
*
* @return a status
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer (Static block shape).
*
@@ -109,10 +131,15 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output info. Data types supported: same as @p input
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
index b30be9b24f..f82af3af9b 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
@@ -61,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEAND_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
index 1456ebe57e..31f8e86802 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
@@ -60,5 +60,5 @@ public:
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISENOT_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
index ff0a1f0d73..9a25a2099e 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
@@ -61,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
index 0cd9d073b4..9e288ef7b6 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
@@ -61,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEXOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
index d3499c3949..dba5497f5d 100644
--- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
+++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
@@ -64,7 +64,10 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -76,7 +79,11 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -90,7 +97,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H */
diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
index d2cea7a8a2..9433f08fac 100644
--- a/arm_compute/runtime/CL/functions/CLCast.h
+++ b/arm_compute/runtime/CL/functions/CLCast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLCAST_H
#define ARM_COMPUTE_CLCAST_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -61,43 +60,29 @@ public:
* |src |dst |
* |:--------------|:--------------------------------------|
* |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |S8 | U8, U16, S16, U32, S32, F16, F32 |
* |U16 | U8, S8, S16, U32, S32, F16, F32 |
* |S16 | U8, S8, U16, U32, S32, F16, F32 |
* |U32 | U8, S8, U16, S16, S32, F16, F32 |
* |S32 | U8, S8, U16, S16, U32, F16, F32 |
- * |F16 | U8, S8, U16, S16, U32, F32 |
- * |F32 | U8, S8, U16, S16, U32, F16 |
+ * |U64 | U8, S8, U16, S16, U32, S32, F16, F32 |
+ * |S64 | U8, S8, U16, S16, U32, S32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, S32, U32, F32 |
+ * |F32 | U8, S8, U16, S16, S32, U32, F16 |
*
* Input data type must be different than output data type.
*
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
* @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy.
*/
void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
- /** Initialize the function's source, destination
- *
- * Input data type must be different than output data type.
- *
- * Valid conversions Input -> Output :
- *
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
+ // Initialize the function's source, destination
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLCast
*
- * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
* @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy.
*
diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
index d60548d9cc..8ca848a020 100644
--- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
+++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
@@ -46,6 +46,7 @@ public:
*
* Valid data layouts:
* - NCHW
+ * - NHWC
*
* Valid data type configurations:
* |src |dst |
@@ -64,7 +65,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h
index 3f984900ee..fca4b168b0 100644
--- a/arm_compute/runtime/CL/functions/CLComparison.h
+++ b/arm_compute/runtime/CL/functions/CLComparison.h
@@ -66,7 +66,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[out] operation Comparison operation to be used.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparison
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -76,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
};
/** Basic function to run @ref CLComparisonKernel */
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index 71e84e21b5..88c4bed595 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLCONCATENATELAYER_H
#define ARM_COMPUTE_CLCONCATENATELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
#include <vector>
@@ -95,7 +94,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
*/
- void configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+ void configure(const CLCompileContext &compile_context,
+ std::vector<const ICLTensor *> &inputs_vector,
+ ICLTensor *output,
+ size_t axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
@@ -108,7 +110,8 @@ public:
*
* @return a status
*/
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status
+ validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLConv3D.h b/arm_compute/runtime/CL/functions/CLConv3D.h
new file mode 100644
index 0000000000..aabaf01ab7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLConv3D.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONVOLUTION3DLAYER_H
+#define ARM_COMPUTE_CLCONVOLUTION3DLAYER_H
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+struct Conv3dInfo;
+class Status;
+
+/** Basic function to compute the convolution3d layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::ClDirectConv3d
+ */
+class CLConv3D : public IFunction
+{
+public:
+ /** Construtor */
+ CLConv3D();
+ /** Destructor */
+ ~CLConv3D();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConv3D(const CLConv3D &) = delete;
+ /** Default move constructor */
+ CLConv3D(CLConv3D &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConv3D &operator=(const CLConv3D &) = delete;
+ /** Default move assignment operator */
+ CLConv3D &operator=(CLConv3D &&) = default;
+ /** Set the src and dst tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
+ * while every optional dimension from 5 and above represent a batch of srcs.
+ * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d].
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts.
+ * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info);
+ /** Set the src and dst tensors.
+ *
+ * Similar to CLConv3D::configure() but using the default compile context
+ *
+ */
+ void configure(const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLConv3D
+ *
+ * Similar to CLConv3D::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONVOLUTION3DLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
index 6c7d9e52e8..409430d595 100644
--- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,10 +68,11 @@ public:
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return A status
*/
- void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Initialize the function.
*
* @param[in] compile_context The compile context to be used.
@@ -79,10 +80,12 @@ public:
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return A status
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeights
*
* @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -90,7 +93,10 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
void run() override;
@@ -148,7 +154,10 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
_func.configure(compile_context, input, &_output, original_input_shape, data_layout);
}
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 6884754d83..8487be71c3 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -35,11 +35,15 @@
namespace arm_compute
{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
*
- * -# @ref CLGEMMConvolutionLayer
- * -# @ref CLWinogradConvolutionLayer
- * -# @ref CLDirectConvolutionLayer
+ * -# @ref opencl::ClGemmConv2d
+ * -# @ref opencl::ClWinogradConv2d
+ * -# @ref opencl::ClDirectConv2d
* -# @ref CLFFTConvolutionLayer
*
* The function selects one of the algorithms mentioned above based on:
@@ -116,8 +120,16 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -138,9 +150,17 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -162,9 +182,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -182,17 +209,24 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*
- * @return a status
+ * @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info,
+ const GPUTarget gpu_target,
+ const Size2D &dilation = Size2D(1U, 1U),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- std::shared_ptr<IMemoryManager> _memory_manager;
- std::unique_ptr<IFunction> _function;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
-#endif /* ARM_COMPUTE_CLCONVOLUTIONLAYER_H */
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h
index 4fc4183d3e..fd40b7b9de 100644
--- a/arm_compute/runtime/CL/functions/CLCopy.h
+++ b/arm_compute/runtime/CL/functions/CLCopy.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -74,7 +75,10 @@ public:
* @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLCopy
*
* @param[in] input Source tensor. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLCrop.h b/arm_compute/runtime/CL/functions/CLCrop.h
index d2b72a5eff..2942e9362a 100644
--- a/arm_compute/runtime/CL/functions/CLCrop.h
+++ b/arm_compute/runtime/CL/functions/CLCrop.h
@@ -21,12 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_CROP_H
-#define ARM_COMPUTE_CL_CROP_H
+#ifndef ARM_COMPUTE_CLCROP_H
+#define ARM_COMPUTE_CLCROP_H
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -71,7 +72,13 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
/** Configure function
*
* @note Supported tensor rank: up to 4
@@ -85,8 +92,14 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *output_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -100,8 +113,13 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *output_window = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
// Inherited methods overridden:
void run() override;
@@ -111,4 +129,4 @@ private:
std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_CROP_H */
+#endif /*ARM_COMPUTE_CLCROP_H */
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h
index 5c60c2879c..6fb055e893 100644
--- a/arm_compute/runtime/CL/functions/CLCropResize.h
+++ b/arm_compute/runtime/CL/functions/CLCropResize.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_CL_CROP_RESIZE_H
#include "arm_compute/core/CL/ICLTensor.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLCrop.h"
@@ -82,8 +81,13 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -100,8 +104,14 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NESlice
*
@@ -121,8 +131,13 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+ static Status validate(const ITensorInfo *input,
+ ITensorInfo *boxes,
+ ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value);
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 2dd4cd4bf5..92f87ee461 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,6 +44,8 @@ public:
/** Default constructor */
CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ ~CLDeconvolutionLayer();
+
/** Set the input, weights, biases and output tensors.
*
* Valid data layouts:
@@ -65,10 +67,15 @@ public:
* @param[in] bias (Optional) The biases have one dimension. Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -77,11 +84,16 @@ public:
* @param[in] bias (Optional) The biases have one dimension. Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
@@ -89,15 +101,23 @@ public:
* @param[in] bias (Optional) The biases have one dimension. Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
- static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info);
+ static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info);
// Inherited methods overridden:
void run() override;
void prepare() override;
@@ -105,6 +125,9 @@ public:
private:
std::shared_ptr<IMemoryManager> _memory_manager;
std::unique_ptr<IFunction> _function;
+
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDECONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index 344ebd0afb..5a2abafe79 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -82,7 +82,8 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
index 58deb7ec40..3e7ca8830b 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLDEPTHCONVERT_H
#define ARM_COMPUTE_CLDEPTHCONVERT_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -96,7 +95,11 @@ public:
* @param[in] policy Conversion policy.
* @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ uint32_t shift);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayer
*
* @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
index 0026cc2b67..14d0a7ec7c 100644
--- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
@@ -60,7 +60,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayer.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -71,5 +72,5 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index f31a17d9cb..2c0fa7aa22 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/IFunction.h"
@@ -33,13 +34,14 @@
namespace arm_compute
{
class CLCompileContext;
-class CLFillBorderKernel;
class CLDepthwiseConvolutionLayerNativeKernel;
-class CLDepthwiseConvolutionLayer3x3NCHWKernel;
-class CLDepthwiseConvolutionLayer3x3NHWCKernel;
class ICLTensor;
/** Function to execute a depthwise convolution
+ *
+ * -# @ref CLDepthwiseConvolutionLayerNativeKernel
+ * -# @ref CLPermute (if the data layout is NCHW)
+ *
*/
class CLDepthwiseConvolutionLayer : public IFunction
{
@@ -72,284 +74,87 @@ public:
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
* |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
*
+ * @param[in] compile_context The compile context to be used.
* @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
* @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
* Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
* @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ *
+ * @note: For in-place support, please check @ref CLDepthwiseConvolutionLayerNativeKernel
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
+
/** Initialize the function's source, destination, weights and convolution information.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * Similar to @ref CLDepthwiseConvolutionLayer::configure()
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * Similar to @ref CLDepthwiseConvolutionLayer::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overriden:
void run() override;
void prepare() override;
-private:
- /** Static function to choose the best depthwise convolution function for @ref CLDepthwiseConvolutionLayer
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a Depthwise Convolution Function
- */
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- /** Basic function to execute a depthwise convolution for kernel size 3x3xC (when data layout NCHW) or Cx3x3 (when data layout NHWC). This function calls the following OpenCL kernels:
- *
- * -# @ref CLDepthwiseConvolutionLayer3x3NCHWKernel (if data_layout == NCHW)
- * -# @ref CLDepthwiseConvolutionLayer3x3NHWCKernel (if data_layout == NHWC)
- * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0)
- *
- */
- class CLDepthwiseConvolutionLayerInternal3x3 : public IFunction
+ void set_memory_group(std::shared_ptr<IMemoryManager> memory_manager)
{
- public:
- /** Default constructor */
- CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerInternal3x3(const CLDepthwiseConvolutionLayerInternal3x3 &) = delete;
- /** Default move constructor */
- CLDepthwiseConvolutionLayerInternal3x3(CLDepthwiseConvolutionLayerInternal3x3 &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerInternal3x3 &operator=(const CLDepthwiseConvolutionLayerInternal3x3 &) = delete;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayerInternal3x3 &operator=(CLDepthwiseConvolutionLayerInternal3x3 &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8 for all layouts, F16/F32 for NCHW.
- * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overriden:
- void run() override;
- void prepare() override;
-
- void set_memory_group(std::shared_ptr<IMemoryManager> memory_manager)
- {
- _memory_group = MemoryGroup(std::move(memory_manager));
- };
-
- private:
- MemoryGroup _memory_group;
- std::unique_ptr<CLDepthwiseConvolutionLayer3x3NCHWKernel> _kernel_nchw;
- std::unique_ptr<CLDepthwiseConvolutionLayer3x3NHWCKernel> _kernel_nhwc;
- std::unique_ptr<CLFillBorderKernel> _border_handler;
- CLPermute _permute_input_to_nchw;
- CLPermute _permute_weights_to_nchw;
- CLPermute _permute_output_to_nhwc;
- CLTensor _permuted_input;
- CLTensor _permuted_weights;
- CLTensor _permuted_output;
- CLTensor _output_multipliers;
- CLTensor _output_shifts;
- const ITensor *_original_weights;
- const ITensor *_input;
- const ITensor *_output;
- bool _needs_permute;
- bool _is_prepared;
- bool _is_quantized;
- bool _is_nhwc;
+ _memory_group = MemoryGroup(std::move(memory_manager));
};
- /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels:
- *
- * -# @ref CLDepthwiseConvolutionLayerNativeKernel
- * -# @ref CLPermute (x 3) if the data layout is NCHW
- *
- */
- class CLDepthwiseConvolutionLayerGeneric : public IFunction
- {
- public:
- /** Default constructor */
- CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerGeneric(const CLDepthwiseConvolutionLayerGeneric &) = delete;
- /** Default move constructor */
- CLDepthwiseConvolutionLayerGeneric(CLDepthwiseConvolutionLayerGeneric &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerGeneric &operator=(const CLDepthwiseConvolutionLayerGeneric &) = delete;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayerGeneric &operator=(CLDepthwiseConvolutionLayerGeneric &&) = default;
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerGeneric
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F32.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overriden:
- void run() override;
- void prepare() override;
-
- void set_memory_group(std::shared_ptr<IMemoryManager> memory_manager)
- {
- _memory_group = MemoryGroup(std::move(memory_manager));
- };
-
- private:
- MemoryGroup _memory_group;
-
- std::unique_ptr<CLDepthwiseConvolutionLayerNativeKernel> _dwc_native_kernel;
- CLPermute _permute_input_to_nhwc;
- CLPermute _permute_weights_to_nhwc;
- CLPermute _permute_output_to_nchw;
-
- CLTensor _permuted_input;
- CLTensor _permuted_weights;
- CLTensor _permuted_output;
- CLTensor _output_multipliers;
- CLTensor _output_shifts;
- const ITensor *_original_weights;
- const ITensor *_input;
- const ITensor *_output;
-
- bool _needs_permute;
- bool _is_prepared;
- bool _is_quantized;
- };
-
- std::shared_ptr<IMemoryManager> _memory_manager;
-
- DepthwiseConvolutionFunction _depth_conv_func;
- CLDepthwiseConvolutionLayerInternal3x3 _func_3x3;
- CLDepthwiseConvolutionLayerGeneric _func_generic;
+private:
+ MemoryGroup _memory_group;
+
+ std::unique_ptr<CLDepthwiseConvolutionLayerNativeKernel> _dwc_native_kernel;
+ CLPermute _permute_input_to_nhwc;
+ CLPermute _permute_weights_to_nhwc;
+ CLPermute _permute_output_to_nchw;
+
+ CLTensor _permuted_input;
+ CLTensor _permuted_weights;
+ CLTensor _permuted_output;
+ CLTensor _output_multipliers;
+ CLTensor _output_shifts;
+ const ITensor *_original_weights;
+ const ITensor *_input;
+ const ITensor *_output;
+
+ bool _needs_permute;
+ bool _is_prepared;
+ bool _is_quantized;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H */
diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
index 3fd0c63782..84900b03a3 100644
--- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/IFunction.h"
@@ -78,7 +79,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -93,7 +99,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer
*
@@ -110,7 +121,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -120,5 +135,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index 567de13508..14384a09b5 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -24,12 +24,11 @@
#ifndef ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -108,10 +107,15 @@ public:
* Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -122,11 +126,16 @@ public:
* Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -136,12 +145,16 @@ public:
* Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
index 555e84a251..13844c98a1 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
#define ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/ICLOperator.h"
#include "arm_compute/runtime/IFunction.h"
@@ -81,7 +82,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* Valid configurations (Input1,Input2) -> Output :
@@ -107,7 +112,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for addition
*
@@ -133,7 +142,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -191,7 +204,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* Valid configurations (Input1,Input2) -> Output :
@@ -217,7 +234,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for subtraction
*
@@ -243,7 +264,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -291,7 +316,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output.
*
* @param[in] compile_context The compile context to be used.
@@ -302,7 +330,11 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -312,7 +344,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -367,7 +402,10 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -378,7 +416,11 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for max
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
@@ -388,7 +430,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -443,7 +488,10 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -454,7 +502,11 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for min
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
@@ -464,7 +516,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -517,7 +572,10 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -528,7 +586,11 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for squared difference
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
@@ -538,7 +600,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -586,7 +651,10 @@ public:
* @param[out] output Output tensor. Data types supported:F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -597,7 +665,11 @@ public:
* @param[out] output Output tensor. Data types supported:F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for power
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -607,7 +679,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
index 594ee4cfdc..d186b70d93 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
#define ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
index c7112dc737..49ecf3c260 100644
--- a/arm_compute/runtime/CL/functions/CLFFT1D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CLFFT1D_H
#define ARM_COMPUTE_CLFFT1D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
@@ -82,7 +81,10 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT1DInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFT1D.
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h
index 3d20327bf1..b7d15f1602 100644
--- a/arm_compute/runtime/CL/functions/CLFFT2D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT2D.h
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_CLFFT2D_H
#define ARM_COMPUTE_CLFFT2D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
@@ -79,7 +78,10 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT2DInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFT2D.
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
index f873cb0b86..ed78bbb7a7 100644
--- a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
@@ -37,6 +35,7 @@
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -94,8 +93,13 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Set the input and output tensors.
*
* @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -113,8 +117,14 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTConvolutionLayer
*
* @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -133,8 +143,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h
index a01e0c3188..be1059761a 100644
--- a/arm_compute/runtime/CL/functions/CLFill.h
+++ b/arm_compute/runtime/CL/functions/CLFill.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,11 @@
#ifndef ARM_COMPUTE_CLFILL_H
#define ARM_COMPUTE_CLFILL_H
+#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -72,7 +74,10 @@ public:
* @param[in] constant_value The value used to fill the planes of the tensor
* @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLFill
*
* @param[in] tensor Source tensor info. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h
deleted file mode 100644
index 20f2e15b72..0000000000
--- a/arm_compute/runtime/CL/functions/CLFillBorder.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFILLBORDER_H
-#define ARM_COMPUTE_CLFILLBORDER_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLFillBorderKernel */
-class CLFillBorder : public ICLSimpleFunction
-{
-public:
- /** Initialize the function
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------|
- * |All |All |
- *
- * @param[in,out] tensor Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
- * @param[in] border_width The border width
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
- * @param[in] border_width The border width
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-};
-}
-#endif /*ARM_COMPUTE_FILLBORDER_H */
diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h
index 87cd5b44c7..4d3d704857 100644
--- a/arm_compute/runtime/CL/functions/CLFloor.h
+++ b/arm_compute/runtime/CL/functions/CLFloor.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLFLOOR_H
#define ARM_COMPUTE_CLFLOOR_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 075c5d1f45..9fd0b4aaef 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,81 +24,19 @@
#ifndef ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H
#define ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
{
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref CLTranspose */
-class CLFullyConnectedLayerReshapeWeightsManaged : public ITransformWeights
-{
-public:
- //Inherited method override
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- //Inherited method override
- void release() override
- {
- _output.allocator()->free();
- }
-
- //Inherited method override
- ICLTensor *get_weights() override
- {
- return &_output;
- }
-
- //Inherited method override
- uint32_t uid() override
- {
- return _uid;
- }
-
- /** Configures the @ref CLTranspose function
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
- void configure(const ICLTensor *input)
- {
- configure(CLKernelLibrary::get().get_compile_context(), input);
- }
- /** Configures the @ref CLTranspose function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input)
- {
- _func.configure(compile_context, input, &_output);
- }
-
-private:
- static constexpr uint32_t _uid = 0x0;
- CLTensor _output{};
- CLTranspose _func{};
-};
-} // namespace weights_transformations
-
/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
*
- * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref CLTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- * -# @ref opencl::kernels::ClGemmMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref opencl::ClGemm or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -106,7 +44,10 @@ class CLFullyConnectedLayer : public IFunction
{
public:
/** Constructor */
- CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
+ /** Default destructor */
+ ~CLFullyConnectedLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLFullyConnectedLayer(const CLFullyConnectedLayer &) = delete;
/** Default move constructor */
@@ -129,22 +70,6 @@ public:
* |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input.
- * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
- /** Set the input and output tensors.
- *
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. The weights must be 2 dimensional.
@@ -158,25 +83,31 @@ public:
* Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Set the input and output tensors.
+ *
+ * Similar to @ref CLFullyConnectedLayer
+ */
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input.
- * @param[out] output Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
+ * Similar to @ref CLFullyConnectedLayer
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
//Inherited methods override
@@ -184,28 +115,8 @@ public:
void prepare() override;
private:
- void configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
- void configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
- void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
-
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- CLConvertFullyConnectedWeights _convert_weights;
- weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed;
- weights_transformations::CLFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function;
- CLFlattenLayer _flatten_layer;
- CLTranspose _reshape_weights_function;
- CLGEMM _mm_gemm;
- CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- CLTensor _flatten_output;
- CLTensor _converted_weights_output;
- CLTensor _reshape_weights_output;
- bool _are_weights_converted;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _is_quantized;
- bool _is_prepared;
- const ICLTensor *_original_weights;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
index cd75270392..2e777273cd 100644
--- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
+++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
@@ -78,9 +78,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -97,9 +104,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalization
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -117,10 +132,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 38a07ef9fb..f5e6aa1237 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_CLGEMM_H
#define ARM_COMPUTE_CLGEMM_H
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTypes.h"
#include "arm_compute/runtime/IFunction.h"
@@ -77,6 +78,9 @@ public:
*
* @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
*
+ * @note Batched GEMM only allows RHS tensor's rank to be <= 3
+ * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+ *
* @param[in] compile_context The compile context to be used.
* @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
@@ -88,13 +92,26 @@ public:
* if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
* in case matrix A and matrix B have been already transformed.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Initialise the kernel's inputs and output
*
* Similar to @ref CLGEMM::configure()
*/
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMM.
*
@@ -102,7 +119,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 082b481047..70ceb1513b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,157 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/ITransformWeights.h"
#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
namespace arm_compute
{
-class CLCol2ImKernel;
-class CLIm2ColKernel;
-class CLWeightsReshapeKernel;
+// Forward declarations
+class CLCompileContext;
class ICLTensor;
-
-/** Function to reshape and transpose the weights. This function calls the following kernels:
- * -# @ref CLWeightsReshapeKernel
- */
-class CLConvolutionLayerReshapeWeights : public IFunction
-{
-public:
- /** Constructor */
- CLConvolutionLayerReshapeWeights();
- /** Prevent instances of this class from being copied */
- CLConvolutionLayerReshapeWeights(const CLConvolutionLayerReshapeWeights &) = delete;
- /** Prevent instances of this class from being copied */
- CLConvolutionLayerReshapeWeights &operator=(const CLConvolutionLayerReshapeWeights &) = delete;
- /** Default move constructor */
- CLConvolutionLayerReshapeWeights(CLConvolutionLayerReshapeWeights &&) = default;
- /** Default move assignment operator */
- CLConvolutionLayerReshapeWeights &operator=(CLConvolutionLayerReshapeWeights &&) = default;
- /** Default destructor */
- ~CLConvolutionLayerReshapeWeights();
- /** Set the input and output tensors.
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[out] output Destination tensor. Data types supported: Same as @p weights.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[out] output Destination tensor. Data types supported: Same as @p weights.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[in] output Destination tensor. Data types supported: Same as @p weights.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::unique_ptr<CLWeightsReshapeKernel> _weights_reshape_kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref CLConvolutionLayerReshapeWeights */
-class CLConvolutionLayerReshapeWeightsTransform : public ITransformWeights
-{
-public:
- /** Configures the @ref CLConvolutionLayerReshapeWeights function
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Data type supported: same as @p input, S32 if @p input is quantized.
- * @param[in] num_groups Number of groups when performing a grouped convolution.
- */
- void configure(const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups)
- {
- configure(CLKernelLibrary::get().get_compile_context(), input, biases, num_groups);
- }
- /** Configures the @ref CLConvolutionLayerReshapeWeights function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Data type supported: same as @p input, S32 if @p input is quantized.
- * @param[in] num_groups Number of groups when performing a grouped convolution.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups)
- {
- _bias_bit = (biases != nullptr) ? 1 : 0;
- _num_groups = num_groups;
- _func.configure(compile_context, input, biases, &_output, num_groups);
- }
-
- //Inherited method override
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- //Inherited method override
- ICLTensor *get_weights() override
- {
- return &_output;
- }
-
- //Inherited method override
- void release() override
- {
- _output.allocator()->free();
- }
-
- //Inherited method override
- uint32_t uid() override
- {
- return ((0x9) | (_bias_bit << 7) | (_num_groups << 8));
- }
-
-private:
- CLTensor _output{};
- CLConvolutionLayerReshapeWeights _func{};
- int32_t _bias_bit{ 0 };
- unsigned int _num_groups{ 0 };
-};
-} // namespace weights_transformations
+class ITensorInfo;
/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
*
- * -# @ref CLIm2ColKernel
- * -# @ref CLGEMM (if the data type is FP32 or FP16)
- * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref CLGEMMLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref CLCol2ImKernel (if NCHW data layout)
+ * -# @ref opencl::ClGemmConv2d
*/
class CLGEMMConvolutionLayer : public IFunction
{
@@ -181,7 +52,8 @@ public:
* @param[in] memory_manager (Optional) Memory manager.
* @param[in] weights_manager (Optional) Weights manager.
*/
- CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLGEMMConvolutionLayer(const CLGEMMConvolutionLayer &) = delete;
/** Default move constructor */
@@ -224,8 +96,15 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -245,9 +124,16 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -268,70 +154,23 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- /** Configures the appropriate matrix multiply routine
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
- * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
- * @param[in, out] output Output tensor. Data types supported: same as @p input.
- * @param[in] gemmlowp_output_stage GEMMLowp output stage info
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] act_info Activation to apply after the matrix multiplication
- */
- void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
- * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
- * @param[in] output Output tensor info. Data types supported: same as @p input.
- * @param[in] gemmlowp_output_stage GEMMLowp output stage info
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
- * @param[in] act_info Activation to apply after the matrix multiplication
- *
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
-
-private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- CLConvolutionLayerReshapeWeights _reshape_weights;
- weights_transformations::CLConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
- std::unique_ptr<CLIm2ColKernel> _im2col_kernel;
- CLGEMM _mm_gemm;
- CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- std::unique_ptr<CLCol2ImKernel> _col2im_kernel;
- CLActivationLayer _activationlayer_function;
-
- const ICLTensor *_original_weights;
-
- CLTensor _im2col_output;
- CLTensor _weights_reshaped;
- CLTensor _gemm_output;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _fuse_activation;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
index 6e482c98e7..3e8929c5ad 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
@@ -26,6 +26,8 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
@@ -111,7 +113,11 @@ public:
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info);
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -122,7 +128,12 @@ public:
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -134,7 +145,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 3d2dbdb104..1b8e5dcc1d 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,19 @@
#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include <memory>
+
namespace arm_compute
{
class CLCompileContext;
class IMemoryManager;
class ICLTensor;
class ITensorInfo;
-class CLGEMMLowpMatrixMultiplyNativeKernel;
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel;
-class CLGEMMLowpOffsetContributionKernel;
-class CLGEMMLowpOffsetContributionOutputStageKernel;
-class CLGEMMLowpMatrixAReductionKernel;
-class CLGEMMLowpMatrixBReductionKernel;
-namespace opencl
-{
-namespace kernels
-{
-class ClGemmReshapeRhsMatrixKernel;
-} // namespace kernels
-} // namespace opencl
-
-namespace opencl
-{
-namespace kernels
-{
-class ClCastKernel;
-} // namespace kernels
-} // namespace opencl
/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
class CLGEMMLowpMatrixMultiplyCore : public IFunction
@@ -109,7 +91,11 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Initialise the kernel's inputs, output
*
* @note GEMMLowp: low precision GEMM kernel. [A * B + C]
@@ -128,7 +114,12 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
*
* @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
@@ -140,47 +131,19 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
-
- // Kernels used
- std::unique_ptr<opencl::kernels::ClCastKernel> _weights_to_qasymm8;
- std::unique_ptr<CLGEMMLowpMatrixMultiplyNativeKernel> _mm_native_kernel;
- std::unique_ptr<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
- std::unique_ptr<CLGEMMLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
- std::unique_ptr<CLGEMMLowpOffsetContributionKernel> _offset_contribution_kernel;
- std::unique_ptr<CLGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
-
- // Temporary tensors
- CLTensor _qasymm8_weights;
- CLTensor _vector_sum_col;
- CLTensor _vector_sum_row;
- CLTensor _tmp_b;
- CLTensor _mm_result_s32;
- CLTensor _gemm_output_stage_multipliers;
- CLTensor _gemm_output_stage_shifts;
-
- // Tensor pointers
- const ICLTensor *_matrix_a;
- const ICLTensor *_original_b;
- const ICLTensor *_output;
-
- int32_t _a_offset;
- int32_t _b_offset;
- bool _is_gemm_reshaped;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _run_output_stage;
- bool _convert_to_qasymm8;
- bool _run_offset_contribution;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H */ \ No newline at end of file
+#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index a60992a0f4..ff9c872896 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -45,247 +45,28 @@ class ICLTensor;
class ITensorInfo;
struct GEMMLowpOutputStageInfo;
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- * result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following CL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
- *
- * @param[in] input Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
*
* This function calls the following CL kernels:
*
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*/
-class CLGEMMLowpOutputStage : public ICLSimpleFunction
+class CLGEMMLowpOutputStage : public IFunction
{
public:
+ CLGEMMLowpOutputStage();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpOutputStage(const CLGEMMLowpOutputStage &) = delete;
+ /** Default move constructor */
+ CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpOutputStage &operator=(const CLGEMMLowpOutputStage &) = delete;
+ /** Default move assignment operator */
+ CLGEMMLowpOutputStage &operator=(CLGEMMLowpOutputStage &&);
+ /** Default destructor */
+ ~CLGEMMLowpOutputStage();
/** Initialise the kernel's inputs, output
*
* Valid data layouts:
@@ -304,7 +85,8 @@ public:
* @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
/** Initialise the kernel's inputs, output
*
* @param[in] compile_context The compile context to be used.
@@ -314,8 +96,12 @@ public:
* @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*
* @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
* @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
@@ -325,7 +111,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h
index 7a57c7358c..360c8757b6 100644
--- a/arm_compute/runtime/CL/functions/CLGather.h
+++ b/arm_compute/runtime/CL/functions/CLGather.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
* |All |All |
*
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*/
@@ -58,22 +58,27 @@ public:
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
* @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor info. Supported tensor rank: up to 4. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 4. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
* @param[in] output Destination tensor info. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLGATHER_H */
diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
index aec5cdf1a8..3a201e79b0 100644
--- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
@@ -100,7 +100,12 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+ void configure(const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info);
/** Set the input and output tensors.
*
@@ -118,8 +123,14 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out,
- ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGenerateProposalsLayer
*
@@ -135,7 +146,11 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ static Status validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals,
const GenerateProposalsInfo &info);
diff --git a/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
new file mode 100644
index 0000000000..91952af5dc
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H
+#define ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run the indirect convolution function
+ */
+class CLIndirectConvolutionLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLIndirectConvolutionLayer();
+ /** Destructor */
+ ~CLIndirectConvolutionLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLIndirectConvolutionLayer(const CLIndirectConvolutionLayer &) = delete;
+ /** Default move constructor */
+ CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLIndirectConvolutionLayer &operator=(const CLIndirectConvolutionLayer &) = delete;
+ /** Default move assignment operator */
+ CLIndirectConvolutionLayer &operator=(CLIndirectConvolutionLayer &&);
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input,
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor. Data type supported:Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output, while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input,
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions. Data type supported:Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output, while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CLIndirectConvolutionLayer
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input,
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions. Data type supported:Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[in] output Destination tensor. 3 lower dimensions represent a single output, while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
index 985a6a75f7..98d215dd4b 100644
--- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
@@ -83,7 +83,12 @@ public:
* @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
* @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution
*/
- void configure(ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -95,7 +100,13 @@ public:
* @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
* @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -108,8 +119,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
- void run() override;
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
+ void run() override;
private:
std::unique_ptr<ICLKernel> _inst_norm_kernel; /**< Kernel to run */
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index 4dc5c778d2..a8b356a708 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -26,8 +26,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -89,7 +89,8 @@ public:
* @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
* @param[in] epsilon (Optional) Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f);
+ void configure(
+ const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer.
*
@@ -111,5 +112,5 @@ private:
std::unique_ptr<CLL2NormalizeLayerKernel> _normalize_kernel;
CLTensor _sumsq;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLL2NORMALIZELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index d26b4c5595..fe494991af 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_CLLSTMLAYER_H
#define ARM_COMPUTE_CLLSTMLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
@@ -37,9 +35,10 @@
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
#include <memory>
@@ -53,7 +52,7 @@ namespace kernels
{
class ClTransposeKernel;
}
-}
+} // namespace opencl
/** This function performs a single time step in a Long Short-Term Memory (LSTM) layer.
*
@@ -120,13 +119,26 @@ public:
* @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0f then clipping is disabled.
*/
- void configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Initialize function's tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -166,13 +178,27 @@ public:
* @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0f then clipping is disabled.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayer
*
@@ -214,13 +240,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
// Inherited methods overridden:
void run() override;
@@ -311,7 +350,7 @@ private:
bool _perform_projection_clipping;
bool _is_prepared;
bool _is_layer_norm_lstm;
- const ICLTensor *_recurrent_to_cell_weights{ nullptr };
+ const ICLTensor *_recurrent_to_cell_weights{nullptr};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLLSTMLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
index 2ef7427a5a..8c116b1482 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -35,7 +35,6 @@
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -47,16 +46,16 @@ class ICLTensor;
*
* This function calls the following CL functions/kernels:
*
- * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref CLTranspose Matrix transpose
- * -# @ref CLConcatenateLayer Tensor concatenation
- * -# @ref CLActivationLayer Activation functions (tanh and logistic)
- * -# @ref CLArithmeticAddition Elementwise addition
- * -# @ref CLPixelWiseMultiplication Elementwise multiplication
- * -# @ref CLSlice Tensor slicing
- * -# @ref CLDequantizationLayer Dequantize into float
- * -# @ref CLQuantizationLayer Quantize from float
+ * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
+ * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref CLTranspose Matrix transpose
+ * -# @ref CLConcatenateLayer Tensor concatenation
+ * -# @ref CLActivationLayer Activation functions (tanh and logistic)
+ * -# @ref CLArithmeticAddition Elementwise addition
+ * -# @ref CLPixelWiseMultiplication Elementwise multiplication
+ * -# @ref CLSlice Tensor slicing
+ * -# @ref CLDequantizationLayer Dequantize into float
+ * -# @ref CLQuantizationLayer Quantize from float
* */
class CLLSTMLayerQuantized : public IFunction
{
@@ -100,11 +99,22 @@ public:
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
*/
void configure(const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out);
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out);
/** Initialize function's tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -126,12 +136,24 @@ public:
* @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized
*
@@ -156,11 +178,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
@@ -170,30 +203,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- CLGEMMLowpMatrixMultiplyCore _gemmlowp;
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- CLTranspose _transpose_weights;
- CLConcatenateLayer _concat_input_weights;
- CLConcatenateLayer _concat_recurrent_weights;
- CLConcatenateLayer _concat_weights;
- CLConcatenateLayer _concat_inputs;
- CLConcatenateLayer _concat_bias;
- CLActivationLayer _sigmoid_forget_gate;
- CLActivationLayer _sigmoid_input_gate;
- CLActivationLayer _sigmoid_output_gate;
- CLActivationLayer _tanh_modulation_gate;
- CLActivationLayer _tanh_output_state;
- CLArithmeticAddition _add_cell_state_tmps;
- CLArithmeticAddition _add2;
- CLPixelWiseMultiplication _mul_forget_gate_cell_state;
- CLPixelWiseMultiplication _mul_input_gate_input_mod_gate;
- CLPixelWiseMultiplication _mul_output_state_tmp_output_gate;
- CLSlice _slice_input_tensor;
- CLSlice _slice_forget_tensor;
- CLSlice _slice_cell_tensor;
- CLSlice _slice_output_tensor;
- CLDequantizationLayer _dequantize;
- CLQuantizationLayer _quantize;
+ CLGEMMLowpMatrixMultiplyCore _gemmlowp;
+ CLGEMMLowpOutputStage _output_stage;
+ CLTranspose _transpose_weights;
+ CLConcatenateLayer _concat_input_weights;
+ CLConcatenateLayer _concat_recurrent_weights;
+ CLConcatenateLayer _concat_weights;
+ CLConcatenateLayer _concat_inputs;
+ CLConcatenateLayer _concat_bias;
+ CLActivationLayer _sigmoid_forget_gate;
+ CLActivationLayer _sigmoid_input_gate;
+ CLActivationLayer _sigmoid_output_gate;
+ CLActivationLayer _tanh_modulation_gate;
+ CLActivationLayer _tanh_output_state;
+ CLArithmeticAddition _add_cell_state_tmps;
+ CLArithmeticAddition _add2;
+ CLPixelWiseMultiplication _mul_forget_gate_cell_state;
+ CLPixelWiseMultiplication _mul_input_gate_input_mod_gate;
+ CLPixelWiseMultiplication _mul_output_state_tmp_output_gate;
+ CLSlice _slice_input_tensor;
+ CLSlice _slice_forget_tensor;
+ CLSlice _slice_cell_tensor;
+ CLSlice _slice_output_tensor;
+ CLDequantizationLayer _dequantize;
+ CLQuantizationLayer _quantize;
// Tensor pointers
const ICLTensor *_input_to_input_weights;
diff --git a/arm_compute/runtime/CL/functions/CLLogicalAnd.h b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
index e3061e1dc3..4ff488782a 100644
--- a/arm_compute/runtime/CL/functions/CLLogicalAnd.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
@@ -111,7 +111,8 @@ public:
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported: same as @p input1.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel
*
* @param[in] input1 First tensor input info. Data types supported: U8.
diff --git a/arm_compute/runtime/CL/functions/CLLogicalNot.h b/arm_compute/runtime/CL/functions/CLLogicalNot.h
index 27fd0f9c9f..c7d9db93d7 100644
--- a/arm_compute/runtime/CL/functions/CLLogicalNot.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalNot.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLLOGICALNOT_H
#define ARM_COMPUTE_CLLOGICALNOT_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -85,7 +84,7 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
- void run() override;
+ void run() override;
private:
struct Impl;
diff --git a/arm_compute/runtime/CL/functions/CLLogicalOr.h b/arm_compute/runtime/CL/functions/CLLogicalOr.h
index 893c22f721..64b6d83177 100644
--- a/arm_compute/runtime/CL/functions/CLLogicalOr.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalOr.h
@@ -111,7 +111,8 @@ public:
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported: same as @p input1.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel
*
* @param[in] input1 First tensor input info. Data types supported: U8.
diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h
new file mode 100644
index 0000000000..9c9939b9d0
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMatMul.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations for used types instead of including their header, that could minimize compile time
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+class MatMulInfo;
+class Status;
+
+/** Settings for MatMul OpenCL implementation */
+class GpuMatMulSettings
+{
+public:
+ /* Placeholder for operator parity between CPU/GPU */
+};
+
+/** Basic function to execute MatMul (Matrix Multiplication) on OpenCL */
+class CLMatMul : public IFunction
+{
+public:
+ /** Default constructor.*/
+ CLMatMul();
+ /** Default destructor */
+ ~CLMatMul();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMatMul(const CLMatMul &) = delete;
+ /** Default move constructor */
+ CLMatMul(CLMatMul &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMatMul &operator=(const CLMatMul &) = delete;
+ /** Default move assignment operator */
+ CLMatMul &operator=(CLMatMul &&);
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ *
+ * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
+ * and stores the result in the dst tensor of the same batch size.
+ * Batch here is number of slices from A and B multiplied at a time, do not confuse with the batch dimension 'N' of NHWC/NCHW
+ * For NHWC for example: the batch is the higher dimensions H * N, and in general it is H * all higher dimensions.
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Left-hand side tensor info containing the input activations as Matrix A. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info containing the input weights as Matrix B. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function level settings
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *rhs,
+ ICLTensor *lhs,
+ ICLTensor *dst,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings = GpuMatMulSettings{},
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ /** Initialise the kernel's inputs and output
+ *
+ * Similar to @ref CLMatMul::configure()
+ */
+ void configure(ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *dst,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings = GpuMatMulSettings{},
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul.
+ *
+ *
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] lhs Left-hand side (Matrix A) tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side (Matrix B) tensor info. Data types supported: same as @p lhs.
+ * @param[out] output Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+
+#endif /* ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */
diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
index f7ff1234f6..2d2f064b4c 100644
--- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
@@ -92,7 +92,11 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -105,7 +109,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
@@ -114,5 +121,5 @@ private:
CLFill _fill;
std::unique_ptr<CLMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
index 68a7df24e6..951db3e419 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
@@ -65,7 +65,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 15406f7728..10fd8ed4c6 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -89,7 +89,10 @@ public:
* Data types supported: same as @p input. Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const NormalizationLayerInfo &norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -100,7 +103,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
index de5155c65a..cdccc16a51 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H
-#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
@@ -44,6 +44,18 @@ class CLNormalizePlanarYUVLayer : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destinationfeature tensor. Data type supported: same as @p input
@@ -62,7 +74,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: Same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayer
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -74,7 +90,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index 7f950bcfb3..89e693bd92 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -76,7 +76,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
@@ -88,8 +92,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayer.
*
@@ -101,7 +109,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h
index 8e15da2287..7ac0bf6b9c 100644
--- a/arm_compute/runtime/CL/functions/CLPermute.h
+++ b/arm_compute/runtime/CL/functions/CLPermute.h
@@ -78,7 +78,10 @@ public:
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration of @ref CLPermute.
*
* @note Arbitrary permutation vectors are supported with rank not greater than 4
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index d352c6e282..f3e5cf9bd3 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
+#include "arm_compute/core/Rounding.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/ICLOperator.h"
#include "arm_compute/runtime/IFunction.h"
@@ -82,8 +84,13 @@ public:
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and convertion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -98,8 +105,14 @@ public:
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
@@ -113,8 +126,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -149,7 +167,10 @@ public:
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output.
*
* @param[in] compile_context The compile context to be used.
@@ -160,7 +181,11 @@ public:
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: F16/F32. Number of channels supported: 2.
@@ -168,7 +193,10 @@ public:
* @param[in] output The output tensor info, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
new file mode 100644
index 0000000000..1c69148771
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPOOLING3DLAYER_H
+#define ARM_COMPUTE_CLPOOLING3DLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run @ref opencl::ClPool3d */
+class CLPooling3dLayer : public IFunction
+{
+public:
+ /** Default Constructor */
+ CLPooling3dLayer();
+ /** Default Destructor */
+ ~CLPooling3dLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPooling3dLayer(const CLPooling3dLayer &) = delete;
+ /** Default move constructor */
+ CLPooling3dLayer(CLPooling3dLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPooling3dLayer &operator=(const CLPooling3dLayer &) = delete;
+ /** Default move assignment operator */
+ CLPooling3dLayer &operator=(CLPooling3dLayer &&) = default;
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ * Cases where pooling region is completely outside input tensor are not supported
+ *
+ * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
+ *
+ * @param[in,out] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPooling3dLayer
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLPOOLING3DLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 1975e15470..3dbdf8aeea 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLPOOLINGLAYER_H
#define ARM_COMPUTE_CLPOOLINGLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -66,12 +65,16 @@ public:
* |F16 |F16 |
* |F32 |F32 |
*
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ * Cases where pooling region is completely outside input tensor are not supported
+ *
* @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+ void
+ configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -80,7 +83,11 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -90,7 +97,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
index 9b36c9e433..4ede906baa 100644
--- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
@@ -66,7 +66,11 @@ public:
* @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
* @param[in] info Prior box layer info.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayer
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -76,12 +80,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
private:
cl::Buffer _min;
cl::Buffer _max;
cl::Buffer _aspect_ratios;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLPRIORBOXLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index bd00d56468..3e76da086f 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -32,7 +32,6 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -40,9 +39,15 @@ namespace arm_compute
// Forward declarations
class CLCompileContext;
class ICLTensor;
-class CLGEMMLowpMatrixAReductionKernel;
class CLQLSTMLayerNormalizationKernel;
class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace opencl
/** Basic function to run @ref CLQLSTMLayer
*
@@ -52,8 +57,8 @@ class ITensorInfo;
* -# @ref CLCopy Copy function for copying output_state_out to output
* -# @ref CLArithmeticAddition Elementwise addition and subtraction
* -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
+ * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref opencl::kernels::ClGemmLowpMatrixAReductionKernel For precomputing effective biases to use
* -# @ref CLPixelWiseMultiplication Elementwise multiplication
* -# @ref CLTranspose Transpose function for reshaping the weights
* */
@@ -121,12 +126,21 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ void configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params);
/** Initialize function's tensors.
@@ -171,12 +185,22 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer
@@ -221,12 +245,21 @@ public:
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params);
// Inherited methods overridden:
@@ -260,10 +293,18 @@ private:
* @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor.
*
*/
- void configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res,
- CLTensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+ void configure_mm(const CLCompileContext &compile_context,
+ CLGEMMLowpMatrixMultiplyCore &mm,
+ CLGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input,
+ const ICLTensor *mm_weights,
+ const ICLTensor *bias,
+ CLTensor *mm_res,
+ CLTensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info);
MemoryGroup _memory_group{};
@@ -272,8 +313,8 @@ private:
{
static constexpr uint32_t max_dimension_supported = 2;
- ICLTensor *_src{ nullptr };
- ICLTensor *_dst{ nullptr };
+ ICLTensor *_src{nullptr};
+ ICLTensor *_dst{nullptr};
size_t _row_size{};
Window _window{};
@@ -297,72 +338,72 @@ private:
};
// Functions used
- CLTranspose _transpose_input_to_forget_weights{};
- CLTranspose _transpose_input_to_cell_weights{};
- CLTranspose _transpose_input_to_output_weights{};
- CLTranspose _transpose_input_to_input_weights{};
- CLTranspose _transpose_recurrent_to_forget_weights{};
- CLTranspose _transpose_recurrent_to_cell_weights{};
- CLTranspose _transpose_recurrent_to_output_weights{};
- CLTranspose _transpose_recurrent_to_input_weights{};
- CLTranspose _transpose_projection_weights{};
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction;
- CLArithmeticAddition _projection_bias_add{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
- CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
- CLGEMMLowpOutputStage _input_to_forget_outstage{};
- CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
- CLGEMMLowpOutputStage _cell_to_forget_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_forget{};
- CLArithmeticAddition _accumulate_cell_forget{};
- CLActivationLayer _forget_gate_sigmoid{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
- CLGEMMLowpOutputStage _input_to_cell_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
- CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_modulation{};
- CLActivationLayer _cell_gate_tanh{};
- CLArithmeticSubtraction _input_gate_sub{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
- CLGEMMLowpOutputStage _input_to_input_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
- CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_input{};
- CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
- CLGEMMLowpOutputStage _cell_to_input_outstage{};
- CLArithmeticAddition _accumulate_cell_input{};
- CLActivationLayer _input_gate_sigmoid{};
- CLPixelWiseMultiplication _pixelwise_mul_forget_cell{};
- CLPixelWiseMultiplication _pixelwise_mul_input_cell{};
- CLArithmeticAddition _add_forget_cell{};
- CLActivationLayer _cell_clip{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
- CLGEMMLowpOutputStage _input_to_output_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
- CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_output{};
- CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
- CLGEMMLowpOutputStage _cell_to_output_outstage{};
- CLArithmeticAddition _accumulate_cell_to_output{};
- CLActivationLayer _output_gate_sigmoid{};
- CLActivationLayer _hidden_tanh{};
- CLPixelWiseMultiplication _pixelwise_mul_hidden{};
- CLGEMMLowpOutputStage _hidden_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_projection{};
- CLGEMMLowpOutputStage _projection_outstage{};
- CLArithmeticAddition _accumulate_projection{};
- CLActivationLayer _projection_clip{};
+ CLTranspose _transpose_input_to_forget_weights{};
+ CLTranspose _transpose_input_to_cell_weights{};
+ CLTranspose _transpose_input_to_output_weights{};
+ CLTranspose _transpose_input_to_input_weights{};
+ CLTranspose _transpose_recurrent_to_forget_weights{};
+ CLTranspose _transpose_recurrent_to_cell_weights{};
+ CLTranspose _transpose_recurrent_to_output_weights{};
+ CLTranspose _transpose_recurrent_to_input_weights{};
+ CLTranspose _transpose_projection_weights{};
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _projection_reduction;
+ CLArithmeticAddition _projection_bias_add{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
+ CLGEMMLowpOutputStage _input_to_forget_outstage{};
+ CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
+ CLGEMMLowpOutputStage _cell_to_forget_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_forget{};
+ CLArithmeticAddition _accumulate_cell_forget{};
+ CLActivationLayer _forget_gate_sigmoid{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
+ CLGEMMLowpOutputStage _input_to_cell_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
+ CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_modulation{};
+ CLActivationLayer _cell_gate_tanh{};
+ CLArithmeticSubtraction _input_gate_sub{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
+ CLGEMMLowpOutputStage _input_to_input_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
+ CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_input{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
+ CLGEMMLowpOutputStage _cell_to_input_outstage{};
+ CLArithmeticAddition _accumulate_cell_input{};
+ CLActivationLayer _input_gate_sigmoid{};
+ CLPixelWiseMultiplication _pixelwise_mul_forget_cell{};
+ CLPixelWiseMultiplication _pixelwise_mul_input_cell{};
+ CLArithmeticAddition _add_forget_cell{};
+ CLActivationLayer _cell_clip{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
+ CLGEMMLowpOutputStage _input_to_output_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
+ CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_output{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
+ CLGEMMLowpOutputStage _cell_to_output_outstage{};
+ CLArithmeticAddition _accumulate_cell_to_output{};
+ CLActivationLayer _output_gate_sigmoid{};
+ CLActivationLayer _hidden_tanh{};
+ CLPixelWiseMultiplication _pixelwise_mul_hidden{};
+ CLGEMMLowpOutputStage _hidden_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_projection{};
+ CLGEMMLowpOutputStage _projection_outstage{};
+ CLArithmeticAddition _accumulate_projection{};
+ CLActivationLayer _projection_clip{};
std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
- CLCopy _copy_output;
+ CLCopy _copy_output;
TensorCopyKernel _projection_bias_copy{};
TensorCopyKernel _projection_output_to_accumulate_copy{};
@@ -370,21 +411,18 @@ private:
TensorCopyKernel _hidden_to_output_copy{};
// Tensor pointers
- const ICLTensor *_input_to_input_weights
- {
- nullptr
- };
- const ICLTensor *_recurrent_to_input_weights{ nullptr };
- const ICLTensor *_projection_bias{ nullptr };
- const ICLTensor *_input_to_forget_weights{ nullptr };
- const ICLTensor *_input_to_cell_weights{ nullptr };
- const ICLTensor *_input_to_output_weights{ nullptr };
- const ICLTensor *_recurrent_to_forget_weights{ nullptr };
- const ICLTensor *_recurrent_to_cell_weights{ nullptr };
- const ICLTensor *_recurrent_to_output_weights{ nullptr };
- const ICLTensor *_projection_weights{ nullptr };
- std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{ {} };
- std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{ {} };
+ const ICLTensor *_input_to_input_weights{nullptr};
+ const ICLTensor *_recurrent_to_input_weights{nullptr};
+ const ICLTensor *_projection_bias{nullptr};
+ const ICLTensor *_input_to_forget_weights{nullptr};
+ const ICLTensor *_input_to_cell_weights{nullptr};
+ const ICLTensor *_input_to_output_weights{nullptr};
+ const ICLTensor *_recurrent_to_forget_weights{nullptr};
+ const ICLTensor *_recurrent_to_cell_weights{nullptr};
+ const ICLTensor *_recurrent_to_output_weights{nullptr};
+ const ICLTensor *_projection_weights{nullptr};
+ std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{{}};
+ std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{{}};
using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -417,78 +455,78 @@ private:
return *_layer_norms[getGateIndex(g)];
}
- inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
+ inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
- CLTensor _input_to_forget_weights_transposed{ nullptr };
- CLTensor _input_to_cell_weights_transposed{ nullptr };
- CLTensor _input_to_output_weights_transposed{ nullptr };
- CLTensor _input_to_input_weights_transposed{ nullptr };
- CLTensor _recurrent_to_forget_weights_transposed{ nullptr };
- CLTensor _recurrent_to_cell_weights_transposed{ nullptr };
- CLTensor _recurrent_to_output_weights_transposed{ nullptr };
- CLTensor _recurrent_to_input_weights_transposed{ nullptr };
- CLTensor _projection_weights_transposed{ nullptr };
- CLTensor _input_to_input_eff_bias{ nullptr };
- CLTensor _recurrent_to_input_eff_bias{ nullptr };
- CLTensor _input_to_forget_eff_bias{ nullptr };
- CLTensor _recurrent_to_forget_eff_bias{ nullptr };
- CLTensor _input_to_cell_eff_bias{ nullptr };
- CLTensor _recurrent_to_cell_eff_bias{ nullptr };
- CLTensor _input_to_output_eff_bias{ nullptr };
- CLTensor _recurrent_to_output_eff_bias{ nullptr };
- CLTensor _projection_reduction_res{ nullptr };
- CLTensor _projection_eff_bias{ nullptr };
- CLTensor _mm_input_to_forget_res{ nullptr };
- CLTensor _mm_recurrent_to_forget_res{ nullptr };
- CLTensor _mul_cell_to_forget_res{ nullptr };
- CLTensor _input_to_forget_outstage_res{ nullptr };
- CLTensor _cell_to_forget_outstage_res{ nullptr };
- CLTensor _recurrent_to_forget_outstage_res{ nullptr };
- CLTensor _forget_gate{ nullptr };
- CLTensor _mm_input_to_cell_res{ nullptr };
- CLTensor _input_to_cell_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_cell_res{ nullptr };
- CLTensor _recurrent_to_cell_outstage_res{ nullptr };
- CLTensor _cell_gate{ nullptr };
- CLTensor _mul_input_cell_res{ nullptr };
- CLTensor _mm_input_to_input_res{ nullptr };
- CLTensor _input_to_input_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_input_res{ nullptr };
- CLTensor _mul_cell_to_input_res{ nullptr };
- CLTensor _cell_to_input_outstage_res{ nullptr };
- CLTensor _recurrent_to_input_outstage_res{ nullptr };
- CLTensor _input_gate{ nullptr };
- CLTensor _mm_input_to_output_res{ nullptr };
- CLTensor _input_to_output_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_output_res{ nullptr };
- CLTensor _mul_cell_to_output_res{ nullptr };
- CLTensor _cell_to_output_outstage_res{ nullptr };
- CLTensor _recurrent_to_output_outstage_res{ nullptr };
- CLTensor _output_gate{ nullptr };
- CLTensor _hidden_mul_res{ nullptr };
- CLTensor _hidden_gate{ nullptr };
- CLTensor _mm_projection_res{ nullptr };
- CLTensor _projection_outstage_res{ nullptr };
- CLTensor _projection_out_res{ nullptr };
- CLTensor _projection_accumulate_res{ nullptr };
- CLTensor _ones{ nullptr };
- std::array<CLTensor, _layer_norm_count> _layer_norm_output{ {} };
+ CLTensor _input_to_forget_weights_transposed{nullptr};
+ CLTensor _input_to_cell_weights_transposed{nullptr};
+ CLTensor _input_to_output_weights_transposed{nullptr};
+ CLTensor _input_to_input_weights_transposed{nullptr};
+ CLTensor _recurrent_to_forget_weights_transposed{nullptr};
+ CLTensor _recurrent_to_cell_weights_transposed{nullptr};
+ CLTensor _recurrent_to_output_weights_transposed{nullptr};
+ CLTensor _recurrent_to_input_weights_transposed{nullptr};
+ CLTensor _projection_weights_transposed{nullptr};
+ CLTensor _input_to_input_eff_bias{nullptr};
+ CLTensor _recurrent_to_input_eff_bias{nullptr};
+ CLTensor _input_to_forget_eff_bias{nullptr};
+ CLTensor _recurrent_to_forget_eff_bias{nullptr};
+ CLTensor _input_to_cell_eff_bias{nullptr};
+ CLTensor _recurrent_to_cell_eff_bias{nullptr};
+ CLTensor _input_to_output_eff_bias{nullptr};
+ CLTensor _recurrent_to_output_eff_bias{nullptr};
+ CLTensor _projection_reduction_res{nullptr};
+ CLTensor _projection_eff_bias{nullptr};
+ CLTensor _mm_input_to_forget_res{nullptr};
+ CLTensor _mm_recurrent_to_forget_res{nullptr};
+ CLTensor _mul_cell_to_forget_res{nullptr};
+ CLTensor _input_to_forget_outstage_res{nullptr};
+ CLTensor _cell_to_forget_outstage_res{nullptr};
+ CLTensor _recurrent_to_forget_outstage_res{nullptr};
+ CLTensor _forget_gate{nullptr};
+ CLTensor _mm_input_to_cell_res{nullptr};
+ CLTensor _input_to_cell_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_cell_res{nullptr};
+ CLTensor _recurrent_to_cell_outstage_res{nullptr};
+ CLTensor _cell_gate{nullptr};
+ CLTensor _mul_input_cell_res{nullptr};
+ CLTensor _mm_input_to_input_res{nullptr};
+ CLTensor _input_to_input_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_input_res{nullptr};
+ CLTensor _mul_cell_to_input_res{nullptr};
+ CLTensor _cell_to_input_outstage_res{nullptr};
+ CLTensor _recurrent_to_input_outstage_res{nullptr};
+ CLTensor _input_gate{nullptr};
+ CLTensor _mm_input_to_output_res{nullptr};
+ CLTensor _input_to_output_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_output_res{nullptr};
+ CLTensor _mul_cell_to_output_res{nullptr};
+ CLTensor _cell_to_output_outstage_res{nullptr};
+ CLTensor _recurrent_to_output_outstage_res{nullptr};
+ CLTensor _output_gate{nullptr};
+ CLTensor _hidden_mul_res{nullptr};
+ CLTensor _hidden_gate{nullptr};
+ CLTensor _mm_projection_res{nullptr};
+ CLTensor _projection_outstage_res{nullptr};
+ CLTensor _projection_out_res{nullptr};
+ CLTensor _projection_accumulate_res{nullptr};
+ CLTensor _ones{nullptr};
+ std::array<CLTensor, _layer_norm_count> _layer_norm_output{{}};
inline CLTensor &get_layer_norm_output(LayerNormGate g)
{
return _layer_norm_output[getGateIndex(g)];
}
- bool _is_prepared{ false };
- bool _has_cifg{ false };
- bool _has_cell_clipping{ false };
- bool _has_projection{ false };
- bool _has_projection_clipping{ false };
- bool _has_peephole{ false };
- bool _has_layer_norm{ false };
- bool _projection_tensor_copy_required{ false };
+ bool _is_prepared{false};
+ bool _has_cifg{false};
+ bool _has_cell_clipping{false};
+ bool _has_projection{false};
+ bool _has_projection_clipping{false};
+ bool _has_peephole{false};
+ bool _has_layer_norm{false};
+ bool _projection_tensor_copy_required{false};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLQLSTMLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 2b3b35e37d..a8d835d04d 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CLRNN_LAYER_H
#define ARM_COMPUTE_CLRNN_LAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include <memory>
@@ -69,7 +69,13 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
@@ -81,8 +87,14 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -95,7 +107,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
const ActivationLayerInfo &info);
// Inherited methods overridden:
@@ -114,5 +131,5 @@ private:
CLTensor _add_output;
bool _is_prepared;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLRNN_LAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
index 1eaea1b297..14d3476711 100644
--- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
@@ -68,7 +68,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -84,7 +85,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -100,7 +105,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLROIALIGNLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
index 151586a1f6..86294596d2 100644
--- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
@@ -66,7 +66,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -81,7 +82,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIPoolingLayer
*
@@ -97,7 +102,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLROIPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h
index fbce05162c..ed665bc398 100644
--- a/arm_compute/runtime/CL/functions/CLRange.h
+++ b/arm_compute/runtime/CL/functions/CLRange.h
@@ -73,7 +73,8 @@ public:
* @param[in] end The ending (not including) value of the sequence.
* @param[in] step The gap between each pair of values in the sequence. Default is 1.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f);
/** Static function to check if given info will lead to a valid configuration of @ref CLRange
*
* @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 1ce088b2ce..640fe7cf1b 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CL_REDUCE_MEAN_H
#define ARM_COMPUTE_CL_REDUCE_MEAN_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
namespace arm_compute
@@ -74,7 +74,11 @@ public:
* @param[in] keep_dims If positive, retains reduced dimensions with length 1.
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLReduceMean
*
@@ -85,7 +89,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+ static Status
+ validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 2245735b62..80068ac35c 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -80,7 +80,8 @@ public:
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
* @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
- void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ void
+ configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -90,7 +91,12 @@ public:
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
* @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
*
@@ -102,7 +108,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
// Inherited methods overridden:
void run() override;
@@ -118,4 +128,4 @@ private:
bool _is_reshape_required;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
deleted file mode 100644
index f69b045c9b..0000000000
--- a/arm_compute/runtime/CL/functions/CLRemap.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAP_H
-#define ARM_COMPUTE_CLREMAP_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to execute remap. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLRemapKernel
- *
- */
-class CLRemap : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------|:------|:------|:------|
- * |U8 |F32 |F32 |U 8 |
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coords. Data types supported: F32.
- * @param[in] map_y Map for Y coords. Data types supported: F32.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coords. Data types supported: F32.
- * @param[in] map_y Map for Y coords. Data types supported: F32.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor. Only CONSTANT and UNDEFINED are supported.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLREMAP_H */
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index 7346b65e9b..dad90e6ba9 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/runtime/CL/ICLOperator.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
#include <memory>
namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h
index 94c63ca92d..46229540b4 100644
--- a/arm_compute/runtime/CL/functions/CLReverse.h
+++ b/arm_compute/runtime/CL/functions/CLReverse.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLREVERSE_H
-#define ARM_COMPUTE_CLREVERSE_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
@@ -45,30 +45,38 @@ public:
* Valid data type configurations:
* |src0 |src1 |dst |
* |:--------------|:--------------|:--------------|
- * |All |U32 |All |
+ * |All |U32, S32 |All |
*
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis);
/** Initialize the function
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLREVERSE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index ddb4a23531..5c3824eb58 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -83,7 +83,10 @@ public:
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLScale
*
diff --git a/arm_compute/runtime/CL/functions/CLScatter.h b/arm_compute/runtime/CL/functions/CLScatter.h
new file mode 100644
index 0000000000..973953624e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLScatter.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSCATTER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSCATTER_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+class ITensorInfo;
+struct ScatterInfo;
+class CLCompileContext;
+
+/** Function to compute ScatterND Layer */
+class CLScatter : public IFunction
+{
+public:
+ /** Default Constructor */
+ CLScatter();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScatter(const CLScatter &) = delete;
+ /** Default move constructor */
+ CLScatter(CLScatter &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScatter &operator=(const CLScatter &) = delete;
+ /** Default move assignment operator */
+ CLScatter &operator=(CLScatter &&);
+ /** Default destructor */
+ ~CLScatter();
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @note Negative indices are treated as out of bounds.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Values used to fill output. Can be nullptr when zero initialization is true.
+ * @param[in] updates Tensor containing values used to update output tensor. Data types supported: same as @p src
+ * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : S32
+ * @param[out] output Destination tensor. Data types supported: same as @p src.
+ * @param[in] info Scatter info object.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *updates,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const ScatterInfo &info);
+ /** Initialise the kernel's inputs and output
+ *
+ * Similar to @ref CLScatter::configure()
+ */
+ void configure(const ICLTensor *src,
+ const ICLTensor *updates,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const ScatterInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLScatter
+ *
+ * @param[in] src Source tensor.
+ * @param[in] updates Tensor containing values used for updating the output Tensor. Data types supported : same as @p src
+ * @param[in] indices Tensor containing Indices to change in the output Tensor. Data types supported : S32
+ * @param[in] output Destination tensor. Data types supported: same as @p src.
+ * @param[in] info Scatter info containing type of scatter.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const ScatterInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSCATTER_H
diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h
index 8b1e6b2019..effcb58313 100644
--- a/arm_compute/runtime/CL/functions/CLSelect.h
+++ b/arm_compute/runtime/CL/functions/CLSelect.h
@@ -62,7 +62,11 @@ public:
* @param[in] y Second input tensor. Data types supported: Same as @p x
* @param[out] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelect
*
* @param[in] c Condition input tensor. Data types supported: U8.
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h
index 297bcd86fe..7a274ded72 100644
--- a/arm_compute/runtime/CL/functions/CLSlice.h
+++ b/arm_compute/runtime/CL/functions/CLSlice.h
@@ -84,7 +84,11 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Static function to check if given info will lead to a valid configuration of @ref CLSlice
*
@@ -100,7 +104,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
// Inherited methods overridden:
void run() override;
@@ -129,7 +134,11 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Static function to check if given info will lead to a valid configuration of @ref CLSlice
*
@@ -145,7 +154,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
};
} // namespace experimental
} // namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 687f8ff6d8..68541e35c5 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H
-#define ARM_COMPUTE_CLSOFTMAXLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -43,12 +43,6 @@ class CLCompileContext;
*
* Log Softmax is calculated by :
* @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
- *
- * This function runs the following operators/kernels:
- * -# If axis is not 0:
- * -# @ref opencl::ClPermute
- * -# @ref opencl::kernels::ClLogits1DNormKernel
- * -# @ref opencl::kernels::ClLogits1DMaxShiftExpSumKernel
*/
template <bool IS_LOG = false>
class CLSoftmaxLayerGeneric : public IFunction
@@ -87,7 +81,11 @@ public:
* @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
* axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ float beta = 1.0f,
+ int32_t axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
*
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
@@ -111,4 +109,4 @@ private:
using CLSoftmaxLayer = CLSoftmaxLayerGeneric<false>;
using CLLogSoftmaxLayer = CLSoftmaxLayerGeneric<true>;
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSOFTMAXLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index 304a74137e..191f4863d5 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -83,7 +83,11 @@ public:
* @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -93,7 +97,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
@@ -104,8 +113,13 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -115,7 +129,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer (Static block shape and paddings)
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -127,7 +144,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
index 8a47e95f9d..1b0dfc2b74 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
@@ -75,7 +75,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayer.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLSplit.h b/arm_compute/runtime/CL/functions/CLSplit.h
index 86c7bdde7d..8d13755212 100644
--- a/arm_compute/runtime/CL/functions/CLSplit.h
+++ b/arm_compute/runtime/CL/functions/CLSplit.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
#include "arm_compute/runtime/IFunction.h"
diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h
index 54c903a706..18745c8a4f 100644
--- a/arm_compute/runtime/CL/functions/CLStackLayer.h
+++ b/arm_compute/runtime/CL/functions/CLStackLayer.h
@@ -85,7 +85,10 @@ public:
* Negative values wrap around
* @param[out] output Output tensor. Data types supported: Same as @p input.
*/
- void configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const std::vector<ICLTensor *> &input,
+ int axis,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h
index 6fab0c0186..b1edc2481c 100644
--- a/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -74,9 +74,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -92,9 +97,15 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
*
@@ -110,9 +121,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
// Inherited methods overridden:
void run() override;
@@ -143,9 +159,15 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
*
@@ -161,9 +183,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
};
} // namespace experimental
} // namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h
index c266adbbd4..4c414670a5 100644
--- a/arm_compute/runtime/CL/functions/CLTile.h
+++ b/arm_compute/runtime/CL/functions/CLTile.h
@@ -59,7 +59,10 @@ public:
* @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
* @param[out] output Destination tensor. Same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTile
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
index a866aeabaa..9dc977fbeb 100644
--- a/arm_compute/runtime/CL/functions/CLTranspose.h
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -88,6 +88,6 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLTRANSPOSE_H */
diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h
index 32ad439b70..a6eee43177 100644
--- a/arm_compute/runtime/CL/functions/CLUnstack.h
+++ b/arm_compute/runtime/CL/functions/CLUnstack.h
@@ -26,9 +26,8 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -72,7 +71,10 @@ public:
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const std::vector<ICLTensor *> &output_vector,
+ int axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLUnstack
*
* @param[in] input Input tensor info. Data type supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index 4b351267e3..efea9a1550 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -83,8 +84,13 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Set the input and output tensors.
*
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
@@ -103,8 +109,14 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer
*
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1 and 1x5 kernels along with unit strides for both NCHW and NHWC data layout
@@ -124,8 +136,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
index 69572c98d2..5f6d12b4a7 100644
--- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
+++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
#include "arm_compute/runtime/CL/CLTuningParams.h"
+
#include "support/ToolchainSupport.h"
#include <memory>
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index a5932d6301..7f70b5fa1f 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -55,10 +55,10 @@ public:
static CPPScheduler &get();
// Inherited functions overridden
- void set_num_threads(unsigned int num_threads) override;
- void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override;
+ void set_num_threads(unsigned int num_threads) override;
+ void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override;
unsigned int num_threads() const override;
- void schedule(ICPPKernel *kernel, const Hints &hints) override;
+ void schedule(ICPPKernel *kernel, const Hints &hints) override;
void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
protected:
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
index 58b4bf25cc..9af4ed6208 100644
--- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -61,8 +61,16 @@ public:
* @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32.
* @param[in] info (Optional) BoxNMSLimitInfo information.
*/
- void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ void configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out = nullptr,
+ ITensor *keeps = nullptr,
+ ITensor *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer
*
* @param[in] scores_in The scores input tensor of size [count, num_classes]. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -81,9 +89,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out,
- const ITensorInfo *classes,
- const ITensorInfo *batch_splits_out = nullptr, const ITensorInfo *keeps = nullptr, const ITensorInfo *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ static Status validate(const ITensorInfo *scores_in,
+ const ITensorInfo *boxes_in,
+ const ITensorInfo *batch_splits_in,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *boxes_out,
+ const ITensorInfo *classes,
+ const ITensorInfo *batch_splits_out = nullptr,
+ const ITensorInfo *keeps = nullptr,
+ const ITensorInfo *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
index f2c7ccccc5..dc8c8e76ba 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H
#define ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -52,7 +51,11 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
+ void configure(const ITensor *input_loc,
+ const ITensor *input_conf,
+ const ITensor *input_priorbox,
+ ITensor *output,
+ DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer
*
* @param[in] input_loc The mbox location input tensor info. Data types supported: F32.
@@ -63,7 +66,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -82,12 +88,12 @@ private:
int _num_priors;
int _num;
- std::vector<LabelBBox> _all_location_predictions;
+ std::vector<LabelBBox> _all_location_predictions;
std::vector<std::map<int, std::vector<float>>> _all_confidence_scores;
- std::vector<BBox> _all_prior_bboxes;
- std::vector<std::array<float, 4>> _all_prior_variances;
- std::vector<LabelBBox> _all_decode_bboxes;
- std::vector<std::map<int, std::vector<int>>> _all_indices;
+ std::vector<BBox> _all_prior_bboxes;
+ std::vector<std::array<float, 4>> _all_prior_variances;
+ std::vector<LabelBBox> _all_decode_bboxes;
+ std::vector<std::map<int, std::vector<int>>> _all_indices;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
index 94248ff314..a40e4f9ecb 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H
#define ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
@@ -65,8 +64,14 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+ void configure(const ITensor *input_box_encoding,
+ const ITensor *input_score,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionPostProcessLayer
*
* @param[in] input_box_encoding The bounding box input tensor info. Data types supported: F32/QASYMM8/QASYMM8_SIGNED.
@@ -80,8 +85,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+ static Status validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
index 71c44a8bd1..af6afc6029 100644
--- a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
+++ b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H
#define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -48,7 +47,12 @@ public:
* @param[in] nms_threshold The threshold used in non maximum suppression.
*
*/
- void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold);
+ void configure(const ITensor *bboxes,
+ const ITensor *scores,
+ ITensor *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold);
/** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression
*
@@ -60,8 +64,12 @@ public:
* @param[in] nms_threshold The threshold used in non maximum suppression.
*
*/
- static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold);
+ static Status validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h
index 85c1502324..232da41b8e 100644
--- a/arm_compute/runtime/CPP/functions/CPPPermute.h
+++ b/arm_compute/runtime/CPP/functions/CPPPermute.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPPERMUTE_H
#define ARM_COMPUTE_CPPPERMUTE_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -53,5 +52,5 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPPERMUTE_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h
index b2b4d07c86..9be081f5bb 100644
--- a/arm_compute/runtime/CPP/functions/CPPSplit.h
+++ b/arm_compute/runtime/CPP/functions/CPPSplit.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,9 +29,6 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/ToolchainSupport.h"
-
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
@@ -41,8 +38,7 @@ template <typename SliceType, typename TensorInterfaceType = ITensor>
class CPPSplit : public IFunction
{
public:
- CPPSplit()
- : _outputs_vector(), _slice_functions(), _num_outputs(0)
+ CPPSplit() : _outputs_vector(), _slice_functions(), _num_outputs(0)
{
}
/** Static function to check if given info will lead to a valid configuration of @ref CPPSplit
@@ -66,14 +62,16 @@ public:
unsigned int total_output_shape_size = 0;
// Sum the output sizes and fall back to evenly-sized splits if any are zero
- const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(), [&total_output_shape_size](ITensorInfo * info)
- {
- unsigned int output_shape_size = info->tensor_shape().total_size();
- total_output_shape_size += output_shape_size;
- return output_shape_size == 0;
- });
-
- if(using_split_shapes)
+ const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(),
+ [&total_output_shape_size](ITensorInfo *info)
+ {
+ unsigned int output_shape_size =
+ info->tensor_shape().total_size();
+ total_output_shape_size += output_shape_size;
+ return output_shape_size == 0;
+ });
+
+ if (using_split_shapes)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != total_output_shape_size);
}
@@ -85,10 +83,10 @@ public:
// Validate output tensors
unsigned int axis_offset = 0;
- for(const auto &output : outputs)
+ for (const auto &output : outputs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- if(using_split_shapes)
+ if (using_split_shapes)
{
output_shape = output->tensor_shape();
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
@@ -99,14 +97,14 @@ public:
// Start/End coordinates
Coordinates start_coords;
Coordinates end_coords;
- for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
{
end_coords.set(d, -1);
}
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- if(tmp_output_info.tensor_shape().total_size() == 0)
+ if (tmp_output_info.tensor_shape().total_size() == 0)
{
tmp_output_info = input->clone()->set_is_resizable(true).set_tensor_shape(output_shape);
}
@@ -130,7 +128,8 @@ public:
* from the split dimension.
* @param[in] axis Axis on which to split the input.
*/
- void configure(const TensorInterfaceType *input, const std::vector<TensorInterfaceType *> &outputs, unsigned int axis)
+ void
+ configure(const TensorInterfaceType *input, const std::vector<TensorInterfaceType *> &outputs, unsigned int axis)
{
// Create Slice functions
_num_outputs = outputs.size();
@@ -138,17 +137,16 @@ public:
// Extract output tensor info
std::vector<ITensorInfo *> outputs_info;
- for(auto &output : outputs)
+ for (auto &output : outputs)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
outputs_info.emplace_back(output->info());
}
// If any of the outputs have a zero size, fall-back to using evenly-sized output splits
- const bool outputs_have_sizes = std::none_of(outputs_info.begin(), outputs_info.end(), [](ITensorInfo * info)
- {
- return info->tensor_shape().total_size() == 0;
- });
+ const bool outputs_have_sizes =
+ std::none_of(outputs_info.begin(), outputs_info.end(),
+ [](ITensorInfo *info) { return info->tensor_shape().total_size() == 0; });
// Validate
ARM_COMPUTE_ERROR_THROW_ON(CPPSplit::validate(input->info(), outputs_info, axis));
@@ -156,12 +154,13 @@ public:
unsigned int axis_offset = 0;
unsigned int i = 0;
- for(const auto &output_info : outputs_info)
+ for (const auto &output_info : outputs_info)
{
// Get output shape
- TensorShape output_shape = (outputs_have_sizes ?
- output_info->tensor_shape() :
- arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs));
+ TensorShape output_shape =
+ (outputs_have_sizes
+ ? output_info->tensor_shape()
+ : arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs));
const size_t axis_split_step = output_shape[axis];
@@ -169,7 +168,7 @@ public:
Coordinates start_coords;
Coordinates end_coords;
- for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
{
end_coords.set(d, -1);
}
diff --git a/arm_compute/runtime/CPP/functions/CPPTopKV.h b/arm_compute/runtime/CPP/functions/CPPTopKV.h
index 2f63084056..232cbb3067 100644
--- a/arm_compute/runtime/CPP/functions/CPPTopKV.h
+++ b/arm_compute/runtime/CPP/functions/CPPTopKV.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPTOPKV_H
#define ARM_COMPUTE_CPPTOPKV_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -54,7 +53,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
+ static Status
+ validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPTOPKV_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPUpsample.h b/arm_compute/runtime/CPP/functions/CPPUpsample.h
index b97d4d1cc1..3b0f997b17 100644
--- a/arm_compute/runtime/CPP/functions/CPPUpsample.h
+++ b/arm_compute/runtime/CPP/functions/CPPUpsample.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPUPSAMPLE_H
#define ARM_COMPUTE_CPPUPSAMPLE_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -44,5 +43,5 @@ public:
*/
void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPUPSAMPLE_H */
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 1f4216eb21..c3af17d6f2 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
-#define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include <utility>
@@ -40,19 +41,19 @@ enum class FFTDirection
/** Descriptor used by the FFT1D function */
struct FFT1DInfo
{
- unsigned int axis{ 0 }; /**< Axis to run the FFT on. */
- FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
+ unsigned int axis{0}; /**< Axis to run the FFT on. */
+ FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */
};
/** Descriptor used by the FFT2D function */
struct FFT2DInfo
{
- unsigned int axis0{ 0 }; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/
- unsigned int axis1{ 1 }; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
- FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
+ unsigned int axis0{0}; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/
+ unsigned int axis1{1}; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
+ FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */
};
-/** Descriptor used by the Convolution function */
+/** Descriptor used by the 2d Convolution function */
struct Conv2dInfo
{
Conv2dInfo() = default;
@@ -61,16 +62,52 @@ struct Conv2dInfo
const Size2D &dilation,
const ActivationLayerInfo &act_info,
bool enable_fast_math,
- unsigned int num_groups)
- : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups)
+ unsigned int num_groups,
+ const WeightsInfo &weights_info = WeightsInfo())
+ : conv_info(conv_info),
+ dilation(dilation),
+ act_info(act_info),
+ enable_fast_math(enable_fast_math),
+ num_groups(num_groups),
+ weights_info(weights_info)
{
}
PadStrideInfo conv_info{};
- Size2D dilation{ 1U, 1U };
+ Size2D dilation{1U, 1U};
ActivationLayerInfo act_info{};
- bool enable_fast_math{ false };
- unsigned int num_groups{ 1 };
+ bool enable_fast_math{false};
+ unsigned int num_groups{1};
+ WeightsInfo weights_info{};
};
+
+/** Descriptor used by the 3d Convolution function */
+struct Conv3dInfo
+{
+ Conv3dInfo() = default;
+
+ Conv3dInfo(const Size3D &stride,
+ const Padding3D &padding,
+ const ActivationLayerInfo &act_info,
+ const Size3D &dilation,
+ const DimensionRoundingType &round_type,
+ bool enable_fast_math)
+ : stride(stride),
+ padding(padding),
+ act_info(act_info),
+ dilation(dilation),
+ round_type(round_type),
+ enable_fast_math(enable_fast_math)
+ {
+ }
+
+ Size3D stride{1U, 1U, 1U};
+ Padding3D padding{};
+ ActivationLayerInfo act_info{};
+ Size3D dilation{1U, 1U, 1U};
+ DimensionRoundingType round_type{};
+ bool enable_fast_math{false};
+};
+
} // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
diff --git a/arm_compute/runtime/IAllocator.h b/arm_compute/runtime/IAllocator.h
index 5c28b24fea..f8446db811 100644
--- a/arm_compute/runtime/IAllocator.h
+++ b/arm_compute/runtime/IAllocator.h
@@ -56,5 +56,5 @@ public:
*/
virtual std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IALLOCATOR_H */
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index b7b28f999d..fb68dbbecf 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -58,5 +58,5 @@ public:
{
}
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IFUNCTION_H */
diff --git a/arm_compute/runtime/IMemoryGroup.h b/arm_compute/runtime/IMemoryGroup.h
index a977a4a3c3..77198dd29d 100644
--- a/arm_compute/runtime/IMemoryGroup.h
+++ b/arm_compute/runtime/IMemoryGroup.h
@@ -86,8 +86,7 @@ public:
*
* @param[in] memory_group Memory group to handle
*/
- explicit MemoryGroupResourceScope(IMemoryGroup &memory_group)
- : _memory_group(memory_group)
+ explicit MemoryGroupResourceScope(IMemoryGroup &memory_group) : _memory_group(memory_group)
{
_memory_group.acquire();
}
@@ -100,5 +99,5 @@ public:
private:
IMemoryGroup &_memory_group;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IMEMORYGROUP_H */
diff --git a/arm_compute/runtime/IMemoryManager.h b/arm_compute/runtime/IMemoryManager.h
index 4d7d8cd9c9..42910edfda 100644
--- a/arm_compute/runtime/IMemoryManager.h
+++ b/arm_compute/runtime/IMemoryManager.h
@@ -65,5 +65,5 @@ public:
*/
virtual void clear() = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IMEMORYMANAGER_H */
diff --git a/arm_compute/runtime/IMemoryPool.h b/arm_compute/runtime/IMemoryPool.h
index b8d36c362d..0c112c8f35 100644
--- a/arm_compute/runtime/IMemoryPool.h
+++ b/arm_compute/runtime/IMemoryPool.h
@@ -60,5 +60,5 @@ public:
*/
virtual std::unique_ptr<IMemoryPool> duplicate() = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_IMEMORYPOOL_H */
diff --git a/arm_compute/runtime/IMemoryRegion.h b/arm_compute/runtime/IMemoryRegion.h
index 914aa57fbe..9431663e4e 100644
--- a/arm_compute/runtime/IMemoryRegion.h
+++ b/arm_compute/runtime/IMemoryRegion.h
@@ -37,8 +37,7 @@ public:
*
* @param[in] size Region size
*/
- explicit IMemoryRegion(size_t size)
- : _size(size)
+ explicit IMemoryRegion(size_t size) : _size(size)
{
}
/** Virtual Destructor */
diff --git a/arm_compute/runtime/IPoolManager.h b/arm_compute/runtime/IPoolManager.h
index 481bde5fb6..5f6d4ffbe5 100644
--- a/arm_compute/runtime/IPoolManager.h
+++ b/arm_compute/runtime/IPoolManager.h
@@ -69,5 +69,5 @@ public:
*/
virtual size_t num_pools() const = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IPOOLMANAGER_H */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 417c62cc9c..ae204c8560 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,8 +25,8 @@
#define ARM_COMPUTE_ISCHEDULER_H
#include "arm_compute/core/CPP/CPPTypes.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include <functional>
#include <limits>
@@ -152,7 +152,7 @@ public:
*/
virtual void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func);
- /** Returns the number of threads that the SingleThreadScheduler has in his pool.
+ /** Returns the number of threads that the SingleThreadScheduler has in its pool.
*
* @return Number of threads available in SingleThreadScheduler.
*/
@@ -176,9 +176,9 @@ public:
/** Execute all the passed workloads
*
- * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
+ * @note There is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
*
- * @param[in] workloads Array of workloads to run
+ * @param[in] workloads List of workloads to run
* @param[in] tag String that can be used by profiling tools to identify the workloads run by the scheduler (Can be null).
*/
virtual void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag);
@@ -215,7 +215,22 @@ protected:
*/
void schedule_common(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors);
- CPUInfo _cpu_info{};
+ /** Adjust the number of windows to the optimize performance
+ * (used for small workloads where smaller number of threads might improve the performance)
+ *
+ * @param[in] window Window to use for kernel execution
+ * @param[in] split_dimension Axis of dimension to split
+ * @param[in] init_num_windows Initial number of sub-windows to split
+ * @param[in] kernel Kernel to execute
+ * @param[in] cpu_info The CPU platform used to create the context.
+ *
+ * @return Adjusted number of windows
+ */
+ std::size_t adjust_num_of_windows(const Window &window,
+ std::size_t split_dimension,
+ std::size_t init_num_windows,
+ const ICPPKernel &kernel,
+ const CPUInfo &cpu_info);
private:
unsigned int _num_threads_hint = {};
diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h
index b2d17c6fea..9e481bb563 100644
--- a/arm_compute/runtime/ISimpleLifetimeManager.h
+++ b/arm_compute/runtime/ISimpleLifetimeManager.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H
#include "arm_compute/runtime/ILifetimeManager.h"
-
#include "arm_compute/runtime/IMemoryPool.h"
#include "arm_compute/runtime/Types.h"
@@ -70,7 +69,11 @@ protected:
/** Element struct */
struct Element
{
- Element(void *id_ = nullptr, IMemory *handle_ = nullptr, size_t size_ = 0, size_t alignment_ = 0, bool status_ = false)
+ Element(void *id_ = nullptr,
+ IMemory *handle_ = nullptr,
+ size_t size_ = 0,
+ size_t alignment_ = 0,
+ bool status_ = false)
: id(id_), handle(handle_), size(size_), alignment(alignment_), status(status_)
{
}
@@ -90,11 +93,12 @@ protected:
std::set<void *> bound_elements;
};
- IMemoryGroup *_active_group; /**< Active group */
- std::map<void *, Element> _active_elements; /**< A map that contains the active elements */
- std::list<Blob> _free_blobs; /**< Free blobs */
- std::list<Blob> _occupied_blobs; /**< Occupied blobs */
- std::map<IMemoryGroup *, std::map<void *, Element>> _finalized_groups; /**< A map that contains the finalized groups */
+ IMemoryGroup *_active_group; /**< Active group */
+ std::map<void *, Element> _active_elements; /**< A map that contains the active elements */
+ std::list<Blob> _free_blobs; /**< Free blobs */
+ std::list<Blob> _occupied_blobs; /**< Occupied blobs */
+ std::map<IMemoryGroup *, std::map<void *, Element>>
+ _finalized_groups; /**< A map that contains the finalized groups */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H */
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
index 17e581b40e..e2d3536169 100644
--- a/arm_compute/runtime/ITensorAllocator.h
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -101,9 +101,9 @@ protected:
virtual void unlock() = 0;
private:
- TensorInfo _info_owned{}; /**< Tensor's metadata. */
- TensorInfo *_info_external{ nullptr }; /**< External Tensor's metadata */
- size_t _alignment{}; /**< Tensor's alignment in bytes */
+ TensorInfo _info_owned{}; /**< Tensor's metadata. */
+ TensorInfo *_info_external{nullptr}; /**< External Tensor's metadata */
+ size_t _alignment{}; /**< Tensor's alignment in bytes */
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_ITENSORALLOCATOR_H */
diff --git a/arm_compute/runtime/ITransformWeights.h b/arm_compute/runtime/ITransformWeights.h
index f85b7966c5..08671bbe3c 100644
--- a/arm_compute/runtime/ITransformWeights.h
+++ b/arm_compute/runtime/ITransformWeights.h
@@ -72,7 +72,7 @@ public:
/** Allow instances of this class to be moved */
ITransformWeights &operator=(ITransformWeights &&other)
{
- if(this != &other)
+ if (this != &other)
{
_num_refcount = other._num_refcount.load();
_reshape_run = other._reshape_run;
@@ -119,9 +119,9 @@ public:
}
protected:
- std::atomic<int32_t> _num_refcount{ 0 };
- bool _reshape_run{ false };
+ std::atomic<int32_t> _num_refcount{0};
+ bool _reshape_run{false};
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */
diff --git a/arm_compute/runtime/IWeightsManager.h b/arm_compute/runtime/IWeightsManager.h
index 12aa1da8a7..de8a92faa3 100644
--- a/arm_compute/runtime/IWeightsManager.h
+++ b/arm_compute/runtime/IWeightsManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,10 +76,28 @@ public:
* @return True if the weights tensor is managed else false
*/
bool are_weights_managed(const ITensor *weights);
+ /** Release weights refcount and mark as unused if reaches 0
+ *
+ * @param[in] weights Weights to release
+ */
+ void release(const ITensor *weights);
+ /** Pre-mark the weights as unused. The weights tensor will get marked as unused only when the counter goes to 0
+ *
+ * @param weights Weights to mark unused
+ */
+ void pre_mark_as_unused(const ITensor *weights);
+
+private:
+ struct CounterElement
+ {
+ bool is_unused{false};
+ std::atomic<int> counter{1};
+ };
private:
std::map<const ITensor *, std::vector<ITransformWeights *>> _managed_weights;
+ std::map<const ITensor *, CounterElement> _managed_counter;
std::map<const ITensor *, ITransformWeights *> _managed_weights_parents;
};
-} // arm_compute
-#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */ \ No newline at end of file
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */
diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h
index 1eab605d50..63514c409b 100644
--- a/arm_compute/runtime/Memory.h
+++ b/arm_compute/runtime/Memory.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_MEMORY_H
#include "arm_compute/runtime/IMemory.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -64,8 +63,8 @@ public:
// Inherited methods overridden:
IMemoryRegion *region() final;
IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
+ void set_region(IMemoryRegion *region) final;
+ void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
private:
IMemoryRegion *_region;
diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h
index 9fd2b9fa72..93ea3d2c72 100644
--- a/arm_compute/runtime/MemoryGroup.h
+++ b/arm_compute/runtime/MemoryGroup.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_MEMORYGROUP_H
#define ARM_COMPUTE_MEMORYGROUP_H
-#include "arm_compute/runtime/IMemoryGroup.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IMemoryPool.h"
@@ -59,8 +58,8 @@ public:
// Inherited methods overridden:
void manage(IMemoryManageable *obj) override;
void finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment) override;
- void acquire() override;
- void release() override;
+ void acquire() override;
+ void release() override;
MemoryMappings &mappings() override;
private:
@@ -70,15 +69,13 @@ private:
};
inline MemoryGroup::MemoryGroup(std::shared_ptr<IMemoryManager> memory_manager) noexcept
- : _memory_manager(memory_manager),
- _pool(nullptr),
- _mappings()
+ : _memory_manager(memory_manager), _pool(nullptr), _mappings()
{
}
inline void MemoryGroup::manage(IMemoryManageable *obj)
{
- if(_memory_manager && (obj != nullptr))
+ if (_memory_manager && (obj != nullptr))
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager());
@@ -95,7 +92,7 @@ inline void MemoryGroup::manage(IMemoryManageable *obj)
inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment)
{
- if(_memory_manager)
+ if (_memory_manager)
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager());
_memory_manager->lifetime_manager()->end_lifetime(obj, obj_memory, size, alignment);
@@ -104,7 +101,7 @@ inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_me
inline void MemoryGroup::acquire()
{
- if(!_mappings.empty())
+ if (!_mappings.empty())
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
_pool = _memory_manager->pool_manager()->lock_pool();
@@ -114,7 +111,7 @@ inline void MemoryGroup::acquire()
inline void MemoryGroup::release()
{
- if(_pool != nullptr)
+ if (_pool != nullptr)
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
ARM_COMPUTE_ERROR_ON(_mappings.empty());
@@ -128,5 +125,5 @@ inline MemoryMappings &MemoryGroup::mappings()
{
return _mappings;
}
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_MEMORYGROUP_H */
diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h
index 50547ac38e..7c31fe7f5a 100644
--- a/arm_compute/runtime/MemoryManagerOnDemand.h
+++ b/arm_compute/runtime/MemoryManagerOnDemand.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
#define ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
-#include "arm_compute/runtime/IMemoryManager.h"
-
#include "arm_compute/runtime/ILifetimeManager.h"
#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IPoolManager.h"
#include <memory>
@@ -39,7 +38,8 @@ class MemoryManagerOnDemand : public IMemoryManager
{
public:
/** Default Constructor */
- MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager);
+ MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager,
+ std::shared_ptr<IPoolManager> pool_manager);
/** Prevent instances of this class to be copy constructed */
MemoryManagerOnDemand(const MemoryManagerOnDemand &) = delete;
/** Prevent instances of this class to be copied */
@@ -52,12 +52,12 @@ public:
// Inherited methods overridden:
ILifetimeManager *lifetime_manager() override;
IPoolManager *pool_manager() override;
- void populate(IAllocator &allocator, size_t num_pools) override;
- void clear() override;
+ void populate(IAllocator &allocator, size_t num_pools) override;
+ void clear() override;
private:
std::shared_ptr<ILifetimeManager> _lifetime_mgr; /**< Lifetime manager */
std::shared_ptr<IPoolManager> _pool_mgr; /**< Memory pool manager */
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H */
diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h
index 6408deceaa..f8a4898281 100644
--- a/arm_compute/runtime/MemoryRegion.h
+++ b/arm_compute/runtime/MemoryRegion.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
#define ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
-#include "arm_compute/runtime/IMemoryRegion.h"
-
#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -41,21 +40,17 @@ public:
* @param[in] size Region size
* @param[in] alignment Alignment in bytes of the base pointer. Defaults to 0
*/
- MemoryRegion(size_t size, size_t alignment = 0)
- : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+ MemoryRegion(size_t size, size_t alignment = 0) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
// Allocate backing memory
size_t space = size + alignment;
- _mem = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr)
- {
- delete[] ptr;
- });
- _ptr = _mem.get();
+ _mem = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr) { delete[] ptr; });
+ _ptr = _mem.get();
// Calculate alignment offset
- if(alignment != 0)
+ if (alignment != 0)
{
void *aligned_ptr = _mem.get();
std::align(alignment, size, aligned_ptr, space);
@@ -63,10 +58,9 @@ public:
}
}
}
- MemoryRegion(void *ptr, size_t size)
- : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+ MemoryRegion(void *ptr, size_t size) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
_ptr = ptr;
}
@@ -91,7 +85,7 @@ public:
}
std::unique_ptr<IMemoryRegion> extract_subregion(size_t offset, size_t size) final
{
- if(_ptr != nullptr && (offset < _size) && (_size - offset >= size))
+ if (_ptr != nullptr && (offset < _size) && (_size - offset >= size))
{
return std::make_unique<MemoryRegion>(static_cast<uint8_t *>(_ptr) + offset, size);
}
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h
index 5637d831a3..7971168d24 100644
--- a/arm_compute/runtime/NEON/INEOperator.h
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_INEOPERATOR_H
#define ARM_COMPUTE_INEOPERATOR_H
-#include "../../core/ITensor.h"
#include "arm_compute/runtime/IOperator.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include "arm_compute/runtime/Types.h"
+#include "../../core/ITensor.h"
#include <memory>
namespace arm_compute
@@ -60,8 +60,8 @@ public:
~INEOperator();
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
MemoryRequirements workspace() const override;
protected:
diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h
index 7512759bd0..f783a836ee 100644
--- a/arm_compute/runtime/NEON/INESimpleFunction.h
+++ b/arm_compute/runtime/NEON/INESimpleFunction.h
@@ -57,5 +57,5 @@ protected:
std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */
std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_INESIMPLEFUNCTION_H */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 863a8a6412..cc4d303202 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEFUNCTIONS_H
-#define ARM_COMPUTE_NEFUNCTIONS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
@@ -38,6 +39,7 @@
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
@@ -60,39 +62,41 @@
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/NEON/functions/NEGather.h"
#include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELogical.h"
#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
-#include "arm_compute/runtime/NEON/functions/NELogical.h"
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h"
#include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include "arm_compute/runtime/NEON/functions/NESelect.h"
#include "arm_compute/runtime/NEON/functions/NESlice.h"
@@ -107,4 +111,4 @@
#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
-#endif /* ARM_COMPUTE_NEFUNCTIONS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index a3082d00f6..613f44cc52 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -30,5 +30,5 @@ namespace arm_compute
{
/** CPU Scheduler */
using NEScheduler = Scheduler;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_NESCHEDULER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index b39a8d7701..5584fdc783 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,9 @@
#ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H
#define ARM_COMPUTE_NEACTIVATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include <memory>
@@ -101,5 +101,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-} // namespace arm_computes
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
new file mode 100644
index 0000000000..6c65c055dd
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD
+#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class ActivationLayerInfo;
+
+/** Function to compute Add+Mul+Add fused operation */
+class NEAddMulAdd : public IFunction
+{
+public:
+ /** Constructor */
+ NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEAddMulAdd(const NEAddMulAdd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEAddMulAdd(NEAddMulAdd &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEAddMulAdd &operator=(const NEAddMulAdd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEAddMulAdd &operator=(NEAddMulAdd &&) = delete;
+ /** Destructor */
+ ~NEAddMulAdd();
+ /** Initialize the function's inputs and outputs.
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * Valid data type configurations:
+ * |input1 |input2 |bn_mul |bn_add |add_output |final_output |
+ * |:--------------|:--------------|:--------------|:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |F32 |F32 |
+ *
+ * This is what this composite function (tailored for add followed by a batch norm operation) does:
+ * add_output <- input1 + input2 (add)
+ * final_output <- add_output * bn_mul + bn_add (batch norm = mul+add)
+ *
+ * @param[in] input1 First tensor input. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: Same as @p input.
+ * @param[in] bn_mul The multiplication coefficient on the feature dimension. Data types supported: Same as @p input.
+ * It's one dimensional tensor with size equal to the feature maps [FM]
+ * @param[in] bn_add The addition coefficient on the feature dimension. Data types supported: Same as @p input.
+ * It's one dimensional tensor with size equal to the feature maps [FM]
+ * @param[out] add_output Output of the first add. Data type supported: Same as @p input.
+ * @param[out] final_output Output of the add+mul+add+act composite operation. Data type supported: Same as @p input.
+ * @param[in] policy Policy to handle overflow
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ */
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *bn_mul,
+ ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd
+ *
+ * Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor *
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD */
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index 4392de7b28..3bb50a0f90 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_NEARGMINMAXLAYER_H
#define ARM_COMPUTE_NEARGMINMAXLAYER_H
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/INESimpleFunction.h"
@@ -33,7 +31,6 @@
namespace arm_compute
{
class ITensor;
-
/** Function to calculate the index of the minimum or maximum values in a
* tensor based on an axis.
*
@@ -68,13 +65,13 @@ public:
* - All
*
* Valid data type configurations:
- * |src |dst |
- * |:--------------|:----------|
- * |QASYMM8 |U32, S32 |
- * |QASYMM8_SIGNED |U32, S32 |
- * |S32 |U32, S32 |
- * |F16 |U32, S32 |
- * |F32 |U32, S32 |
+ * |src |dst |
+ * |:--------------|:-------------|
+ * |QASYMM8 |U32, S32 |
+ * |QASYMM8_SIGNED |U32, S32 |
+ * |S32 |U32, S32, S64 |
+ * |F16 |U32, S32 |
+ * |F32 |U32, S32 |
*
* @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
@@ -86,7 +83,7 @@ public:
*
* @param[in] input Input source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
- * @param[in] output Output source tensor info. Data types supported: U32/S32.
+ * @param[in] output Output source tensor info. Data types supported: U32/S32/S64.
* @param[in] op Operation to perform: min or max
*
* @return a status
@@ -97,7 +94,8 @@ public:
void run() override;
private:
- std::unique_ptr<NEReductionOperation> _reduction_function;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEARGMINMAXLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index b8e46ff36e..73a43dbc44 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,9 @@
#define ARM_COMPUTE_NEARITHMETICADDITION_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -62,9 +64,6 @@ public:
* |QSYMM16 |QSYMM16 |QASYMM16 |
* |QSYMM16 |QSYMM16 |S32 |
* |U8 |U8 |U8 |
- * |U8 |U8 |S16 |
- * |U8 |S16 |S16 |
- * |S16 |U8 |S16 |
* |S16 |S16 |S16 |
* |S32 |S32 |S32 |
* |F16 |F16 |F16 |
@@ -76,7 +75,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
@@ -87,7 +90,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index 0c72e946f6..3e4f6356c5 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/INEOperator.h"
@@ -68,9 +69,6 @@ public:
* |QSYMM16 |QSYMM16 |QASYMM16 |
* |QSYMM16 |QSYMM16 |S32 |
* |U8 |U8 |U8 |
- * |U8 |U8 |S16 |
- * |U8 |S16 |S16 |
- * |S16 |U8 |S16 |
* |S16 |S16 |S16 |
* |S32 |S32 |S32 |
* |F16 |F16 |F16 |
@@ -82,7 +80,11 @@ public:
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
*
* @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
@@ -93,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index ec00fbdbf2..99e2dcadbb 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -81,7 +81,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer
*
@@ -98,10 +104,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -109,5 +119,5 @@ public:
private:
std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
index 810bf81a22..ebed0bea29 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H
#define ARM_COMPUTE_NEBATCHTOSPACELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
@@ -64,7 +63,10 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
/** Set the input and output tensors. (Static block shape).
*
@@ -72,8 +74,13 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output);
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -81,7 +88,9 @@ public:
* @param[out] output Tensor output info. Data types supported: same as @p input
*
* @return a status
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer (Static block shape).
*
@@ -89,10 +98,15 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output info. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index 2a196a2de5..aa41fc0df2 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -57,7 +57,8 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform
*
@@ -71,7 +72,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */
diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h
index 30499f5ecf..43cae777f6 100644
--- a/arm_compute/runtime/NEON/functions/NECast.h
+++ b/arm_compute/runtime/NEON/functions/NECast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECAST_H
#define ARM_COMPUTE_NECAST_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -85,7 +84,7 @@ public:
*
* @return a status
*/
- static Status validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy);
// Inherited methods overridden
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
index 8888efec4f..bc19e1a4af 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
@@ -46,6 +46,7 @@ public:
*
* Valid data layouts:
* - NCHW
+ * - NHWC
*
* Valid data type configurations:
* |src |dst |
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index dd1c709d76..1600f85488 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECONCATENATELAYER_H
#define ARM_COMPUTE_NECONCATENATELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -87,7 +86,8 @@ public:
*
* @return a status
*/
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status
+ validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h
new file mode 100644
index 0000000000..525f37f3e7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConv3D.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECONV3D_H
+#define ARM_COMPUTE_NECONV3D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to simulate a 3d convolution. This function calls one of the following functions:
+ * -# @ref cpu::CpuDirectConv3d
+ *
+ */
+class NEConv3D : public IFunction
+{
+public:
+ /** Constructor */
+ NEConv3D();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConv3D(const NEConv3D &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConv3D &operator=(const NEConv3D &) = delete;
+ /** Default move constructor */
+ NEConv3D(NEConv3D &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEConv3D &operator=(NEConv3D &&) = default;
+ /** Default destructor */
+ ~NEConv3D();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] input Source tensor. 4 lower dimensions represent a single input [IFM, width, height, depth],
+ * while every optional dimension from 5 and above represent a batch of inputs.
+ * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_x, kernel_y, kernel_z].
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * @param[out] output Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs.
+ * @param[in] conv_info Contains padding, stride, acitvation information described in @ref Conv3dInfo.
+ */
+ void configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to NEConv3D::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv3dInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECONV3D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index 218877d421..dc6b22d717 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -24,14 +24,14 @@
#ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
#define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/ITransformWeights.h"
-#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */
class NEConvertFullyConnectedWeights : public IFunction
@@ -65,7 +65,8 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void
+ configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights
*
* @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -75,7 +76,10 @@ public:
*
* @return A Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
// Inherited methods overriden:
void run() override;
@@ -84,45 +88,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-
-namespace weights_transformations
-{
-/** Basic function to manage @ref NEConvertFullyConnectedWeights. */
-class NEConvertFullyConnectedWeightsManaged : public ITransformWeights
-{
-public:
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- void release() override
- {
- _output.allocator()->free();
- }
-
- ITensor *get_weights() override
- {
- return &_output;
- }
-
- uint32_t uid() override
- {
- return _uid;
- }
-
- void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
- {
- _func.configure(input, &_output, original_input_shape, data_layout);
- }
-
-private:
- static constexpr uint32_t _uid = 0x4;
- Tensor _output{};
- NEConvertFullyConnectedWeights _func{};
-};
-} // namespace weights_transformations
} // namespace arm_compute
#endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index f19aa8008b..2d07980ade 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NECONVOLUTIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
@@ -38,9 +38,9 @@ namespace arm_compute
class ITensor;
/** Basic function to simulate a convolution layer. This function calls one of the following functions:
- * -# @ref NEGEMMConvolutionLayer (executed only in case GEMM is required for the operation)
- * -# @ref NEWinogradConvolutionLayer (executed only in case Winograd is required for the operation)
- * -# @ref NEDirectConvolutionLayer (executed only in case Direct Convolution is required for the operation)
+ * -# @ref cpu::CpuGemmConv2d (executed only in case GEMM is required for the operation)
+ * -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation)
+ * -# @ref cpu::CpuDirectConv2d (executed only in case Direct Convolution is required for the operation)
* -# @ref NEFFTConvolutionLayer (executed only in case FFT is required for the operation)
*
*
@@ -78,12 +78,12 @@ public:
NEConvolutionLayer(const NEConvolutionLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEConvolutionLayer &operator=(const NEConvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEConvolutionLayer(NEConvolutionLayer &&) = default;
/** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayer(NEConvolutionLayer &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayer &operator=(NEConvolutionLayer &&) = delete;
+ NEConvolutionLayer &operator=(NEConvolutionLayer &&) = default;
/** Default destructor */
- ~NEConvolutionLayer() = default;
+ ~NEConvolutionLayer();
/** Set the input and output tensors.
*
* Valid data layouts:
@@ -111,15 +111,23 @@ public:
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -133,7 +141,7 @@ public:
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
@@ -142,9 +150,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -156,7 +171,7 @@ public:
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
@@ -164,15 +179,21 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- std::shared_ptr<IMemoryManager> _memory_manager;
- std::unique_ptr<IFunction> _function; /**< Function to run */
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h
index ee02c259f4..840c03e968 100644
--- a/arm_compute/runtime/NEON/functions/NECopy.h
+++ b/arm_compute/runtime/NEON/functions/NECopy.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECOPY_H
#define ARM_COMPUTE_NECOPY_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h
index 143bbbc6f1..f806762158 100644
--- a/arm_compute/runtime/NEON/functions/NECropResize.h
+++ b/arm_compute/runtime/NEON/functions/NECropResize.h
@@ -75,8 +75,13 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NESlice
*
@@ -96,8 +101,13 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value);
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 34ab0707c2..aabe42f928 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,15 +24,14 @@
#ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
-#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -76,17 +75,16 @@ class NEDeconvolutionLayer : public IFunction
public:
/** Constructor */
NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDeconvolutionLayer &operator=(const NEDeconvolutionLayer &) = delete;
- /** Prevent instances of this class from being moved (As this class contains pointers) */
- NEDeconvolutionLayer(NEDeconvolutionLayer &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains pointers) */
- NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = delete;
+ /** Default move assignment operator */
+ NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = default;
/** Default destructor */
- virtual ~NEDeconvolutionLayer() = default;
+ ~NEDeconvolutionLayer() = default;
/** Set the input, weights, biases and output tensors.
*
@@ -104,25 +102,50 @@ public:
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
* |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension.
+ * Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+ * the GEMM convolution.
*
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+ * the GEMM convolution.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
void run() override;
@@ -140,6 +163,7 @@ private:
ITensor *_input;
PadStrideInfo _info;
bool _is_prepared;
+ bool _do_upsampling;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index eb0724ae12..7bfdfbd13d 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEDEPTHCONVERT_H
#define ARM_COMPUTE_NEDEPTHCONVERT_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -84,7 +83,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
// Inherited methods overridden
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index b9bdcd1f11..d27369670e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,26 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
class ITensorInfo;
+class NEDepthToSpaceLayerKernel;
/** Basic function to run @ref NEDepthToSpaceLayerKernel. */
-class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
+class NEDepthToSpaceLayer : public IFunction
{
public:
/** Constructor */
- NEDepthToSpaceLayer() = default;
+ NEDepthToSpaceLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -50,7 +51,7 @@ public:
/** Prevent instances of this class from being moved (As this class contains non movable objects) */
NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete;
/** Default destructor */
- ~NEDepthToSpaceLayer() = default;
+ ~NEDepthToSpaceLayer();
/** Set the input and output tensors.
*
* Valid data layouts:
@@ -76,6 +77,11 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ void run() override;
+
+private:
+ std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 2f541758f4..6ad5aa7bfa 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -28,6 +28,7 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
#include <memory>
namespace arm_compute
@@ -80,8 +81,14 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer
*
@@ -98,8 +105,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overriden:
void run() override;
@@ -112,7 +125,7 @@ private:
*
* -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
* -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
- * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
* -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
* -# @ref NEActivationLayer if fused activation is required
*
@@ -127,9 +140,11 @@ private:
/** Default move constructor */
NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
+ NEDepthwiseConvolutionLayerOptimizedInternal &
+ operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
/** Default move assignment operator */
- NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
+ NEDepthwiseConvolutionLayerOptimizedInternal &
+ operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
/** Default destructor */
~NEDepthwiseConvolutionLayerOptimizedInternal() = default;
/** Initialize the function's source, destination, kernels and border_size.
@@ -144,8 +159,14 @@ private:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3
*
@@ -161,8 +182,14 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overriden:
void run() override;
@@ -207,8 +234,14 @@ private:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric
*
@@ -225,8 +258,14 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overriden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
index 2affa8d49e..7a94833d10 100644
--- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
@@ -24,13 +24,12 @@
#ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
#define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
#include "arm_compute/runtime/Tensor.h"
#include <map>
@@ -78,8 +77,14 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+ void configure(const ITensor *input_box_encoding,
+ const ITensor *input_score,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer
*
* @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
@@ -93,8 +98,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+ static Status validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 70352fdfaa..3ae3b2a15c 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -84,7 +85,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer
*
* @note: DirectConvolution only works in the following configurations:
@@ -105,7 +111,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
index 95274bdb0c..ebf2277d1f 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/INEOperator.h"
@@ -72,7 +73,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -82,7 +86,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -132,7 +139,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -142,7 +152,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -192,7 +205,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -202,7 +218,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -248,7 +267,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -258,7 +280,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -305,7 +330,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -315,7 +343,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -376,7 +407,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index 9654b1e604..99c6fd4eb4 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFT1D_H
#define ARM_COMPUTE_NEFFT1D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index 57f38d1942..cefd3df17a 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFT2D_H
#define ARM_COMPUTE_NEFFT2D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
#include "arm_compute/runtime/Tensor.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index c5f4d45b6b..84bfe6b02f 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
@@ -94,8 +93,13 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer
*
* @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -113,8 +117,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index e923ce33e1..1829c71fef 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_NEFILL_H
#define ARM_COMPUTE_NEFILL_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index ab77c28839..44b1d4a62b 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -57,7 +58,10 @@ public:
* @param[in] border_mode Strategy to use for borders.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *input,
+ unsigned int border_width,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index 4d47b068db..77ac484bab 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFLOOR_H
#define ARM_COMPUTE_NEFLOOR_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 9727e108a5..885f8430cf 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,16 +24,15 @@
#ifndef ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H
#define ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
namespace weights_transformations
@@ -77,10 +76,10 @@ private:
} // namespace weights_transformations
/** Basic function to compute a Fully Connected layer. This function calls the following kernels:
- * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref cpu::kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
- * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is not equal to nullptr)
+ * -# @ref NEGEMM or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -88,7 +87,8 @@ class NEFullyConnectedLayer : public IFunction
{
public:
/** Constructor */
- NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
/** Prevent instances of this class from being moved (As this class contains pointers) */
@@ -113,66 +113,65 @@ public:
* |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
+ * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
+ * Similar to @ref NEFullyConnectedLayer::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ /** Static function that queries whether fixed-format kernel exists for a given problem description
+ *
+ * @param[out] expected_weight_format Format in which weights should be for found fixed format kernel
+ * @param[in] input Source tensor
+ * @param[in] weights Weights tensor.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor
+ * @param[in] fc_info Fully connected layer additional info
+ * @param[in] weights_info Describes weights shape
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const FullyConnectedLayerInfo &fc_info,
+ const WeightsInfo &weights_info);
//Inherited methods override
void run() override;
void prepare() override;
private:
- void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
- void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
- void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
-
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEFlattenLayer _flatten;
- NEConvertFullyConnectedWeights _convert_weights;
- weights_transformations::NEConvertFullyConnectedWeightsManaged _convert_weights_managed;
- NETranspose _reshape_weights_function;
- weights_transformations::NEFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function;
- NEGEMM _mm_gemm;
- NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- Tensor _flatten_output;
- Tensor _converted_weights_output;
- Tensor _reshape_weights_output;
- const ITensor *_original_weights;
- bool _are_weights_converted;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _is_quantized_asymmetric;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
index 3dd7f49044..f53b3de7f6 100644
--- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
+++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
@@ -75,9 +75,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -95,10 +102,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 6fa30bd545..29650a5eca 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,44 +24,18 @@
#ifndef ARM_COMPUTE_NEGEMM_H
#define ARM_COMPUTE_NEGEMM_H
-#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
-// Forward declarations
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMMatrixAdditionKernel;
-class NEGEMMMatrixMultiplyKernel;
-class NEGEMMTranspose1xWKernel;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
-
/** Basic function to execute GEMM. This function calls the following kernels:
*
- * If optimized assembly is available:
- * -# @ref cpu::CpuGemmAssemblyDispatch
- * -# @ref NEActivationLayer (if alpha != 1.0)
- * Else:
- * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
- * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
- * -# @ref NEGEMMMatrixMultiplyKernel
- * In both cases:
- * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once)
- * Else:
- * -# @ref NEArithmeticAddition (if c != nullptr and is reshaped once and not optimized assembly in place)
- *
- * -# @ref NEActivationLayer (if activation is specified in GEMMInfo)
+ * -# @ref cpu::CpuGemm
*/
class NEGEMM : public IFunction
{
@@ -93,6 +67,8 @@ public:
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
* @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
*
+ * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+ *
* @param[in] a First input tensor (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
* @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
@@ -102,51 +78,49 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should happen only for the first run
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensor *a,
+ const ITensor *b,
+ const ITensor *c,
+ ITensor *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMM.
*
- * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor info. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
+ * Similar to @ref NEGEMM::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+ /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format
+ * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same
+ * as in @ref NEGEMM::validate() except that all arguments are required.
+ *
+ * @return a status
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- std::unique_ptr<NEGEMMInterleave4x4Kernel> _interleave_kernel;
- std::unique_ptr<NEGEMMTranspose1xWKernel> _transpose_kernel;
- std::unique_ptr<NEGEMMMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue;
- std::unique_ptr<NEGEMMMatrixAdditionKernel> _ma_kernel;
- NEActivationLayer _alpha_scale_func;
- NEArithmeticAddition _add_bias;
- NEActivationLayer _activation_func;
-
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _tmp_d;
- const ITensor *_original_b;
- bool _run_vector_matrix_multiplication;
- bool _run_alpha_scale;
- bool _run_addition;
- bool _run_bias_addition;
- bool _run_activation;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
-
- ITensorPack _asm_glue_tensors{};
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index f39ce4dfa3..d1c5a1c9b3 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -29,15 +29,12 @@
#include "arm_compute/runtime/IMemoryManager.h"
#include <memory>
+
namespace arm_compute
{
// Forward declarations
class ITensor;
class ITensorInfo;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
*
@@ -89,7 +86,8 @@ public:
* Data types supported: Same as @p input.
* @param[in] info Convolution layer descriptor
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+ void
+ configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -105,7 +103,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e89eae1d31..3e84c3e2cf 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,147 +24,31 @@
#ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
class ITensor;
-class NECol2ImKernel;
-class NEIm2ColKernel;
-class NEWeightsReshapeKernel;
-
-/** Function to reshape the weights. This function calls the following kernel:
- * -# @ref NEWeightsReshapeKernel
- */
-class NEConvolutionLayerReshapeWeights : public IFunction
-{
-public:
- /** Constructor */
- NEConvolutionLayerReshapeWeights() noexcept;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = delete;
- /** Default destructor */
- ~NEConvolutionLayerReshapeWeights();
- /** Set the input and output tensors.
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: All.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: same as @p weights.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output Destination tensor. Data types supported: same as @p weights.
- */
- void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
- *
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: All.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: same as @p weights.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output Destination tensor. Data types supported: same as @p weights.
- *
- * @return an error status
- */
- static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref NEConvolutionLayerReshapeWeights */
-class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights
-{
-public:
- /** Constructor */
- NEConvolutionLayerReshapeWeightsTransform() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeightsTransform(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeightsTransform &operator=(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeightsTransform(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeightsTransform &operator=(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
- /** Default destructor */
- ~NEConvolutionLayerReshapeWeightsTransform() = default;
- void configure(const ITensor *input, const ITensor *biases)
- {
- _bias_bit = (biases != nullptr) ? 1 : 0;
- _func.configure(input, biases, &_output);
- }
-
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- ITensor *get_weights() override
- {
- return &_output;
- }
-
- void release() override
- {
- _output.allocator()->free();
- }
-
- uint32_t uid() override
- {
- return ((0x8) | (_bias_bit << 7));
- }
-
- bool is_reshape_run()
- {
- return _reshape_run;
- }
-
-private:
- Tensor _output{};
- NEConvolutionLayerReshapeWeights _func{};
- int32_t _bias_bit{ 0 };
-};
-} // namespace weights_transformations
+class ITensorInfo;
/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
*
- * -# @ref NEIm2ColKernel
- * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
- * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEArithmeticAddition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
- * -# @ref NECol2ImKernel (if NCHW data layout)
+ * -# @ref cpu::CpuGemmConv2d
*
*/
class NEGEMMConvolutionLayer : public IFunction
{
public:
/** Constructor */
- NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
/** Prevent instances of this class from being moved (As this class contains non movable objects) */
@@ -192,118 +76,139 @@ public:
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
* |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
*
- * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- /** Configures the appropriate matrix multiply routine
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+
+ /** Static function to check if there is an optimized version of
+ * GEMM available for the input parameters.
*
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] output Output tensor. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- */
- void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
+ * The method is intended to be used to find out the optimal
+ * memory layout to be used for the weights tensor when running
+ * variable weights execution.
*
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] output Output tensor info. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
+ * The user can query the database of optimised kernels in
+ * arm_gemm by specifying one of the enumerations of
+ * arm_compute::WeightFormat in the weight_format field of the input
+ * parameter weights_info. In case of success, the method
+ * writes the expected format in the output parameter
+ * expected_weight_format. The expected_weight_format can than be
+ * used in the configure method of the class for retrieving the
+ * best optimal kernel.
*
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- int gemm_3d_depth = 1, bool skip_im2col = false);
- /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore
+ * Use case one - query for a specific format:
*
- * @param[in] input_info Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+ * WeightInfo weights_info(..., arm_compute::WeightFormat::OHWIo4, ...); // Set the value of the input query.
+ * if (NEGEMMConvolutionlayer::has_opt_impl(WeightFormat(), ...., weights_info, ...))
+ * {
+ * auto conv = std::unique_ptr<NEGEMMConvolutionlayer>();
+ * conv->configure(..., weights_info, ...); // uses the same WeightFormat the user wanted originally, OHWYo4.
+ * conv->run(...);
+ * }
*
- * @return a status
+ * Use case two - query for any format that would be optimal for the GEMM to execute:
+ *
+ * WeightInfo weights_info(..., arm_compute::WeightFormat::ANY, ...); // Set the value of the input query.
+ * arm_compute::WeightFormat expected_wf;
+ * if (NEGEMMConvolutionlayer::has_opt_impl(expected_wf, ...., weights_info, ...))
+ * {
+ * auto conv = std::unique_ptr<NEGEMMConvolutionlayer>();
+ * // ... code to convert the layout of the weights tensor to the layout returned by has_opt_impl
+ * WeightInfo new_weights_info(..., expected_wf, ...); // Set the value of the WeightFormat returned by has_opt_impl.
+ * conv->configure(..., new_weights_info, ...);
+ * conv->run(...);
+ * }
+ *
+ * Notice that a GEMM configured with a WeightFormat other than
+ * UNSPECIFIED will run GEMM with variable weights mode.
+ *
+ * @param[out] expected_weight_format The arm_compute::WeightFormat expected by the kernel.
+ * @param[in] src Source tensor info.
+ * @param[in] weights Weights tensor info.
+ * @param[in] biases Biases tensor info. Shared biases supported.
+ * @param[in] dst Destination tensor info.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info (optional) Specifies additional configuration parameters for the weights of the GEMM computation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. And no activation (i.e. Linear) which is the default value.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ *
+ * @return a Status
*/
- static Status validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEConvolutionLayerReshapeWeights _reshape_weights;
- weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
- std::unique_ptr<NEIm2ColKernel> _im2col_kernel;
- NEGEMM _mm_gemm;
- NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- std::unique_ptr<NECol2ImKernel> _col2im_kernel;
- NEReshapeLayer _reshape_layer;
-
- const ITensor *_original_weights;
- const ITensor *_original_output;
-
- Tensor _im2col_output;
- Tensor _weights_reshaped;
- Tensor _gemm_output;
- Tensor _gemm_output_3d;
- Tensor _tmp_output;
-
- DataLayout _data_layout;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H */
+#endif /* ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index dc9783f9eb..6d07675d3d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,53 +21,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
-#include "NEActivationLayer.h"
-#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
class ITensor;
-class NEConvertQuantizedSignednessKernel;
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMLowpMatrixMultiplyKernel;
-class NEGEMMLowpOffsetContributionKernel;
-class NEGEMMLowpOffsetContributionOutputStageKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-class NEGEMMLowpMatrixBReductionKernel;
-class NEGEMMTranspose1xWKernel;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
+class ITensorInfo;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
- *
- * -# @ref NEGEMMInterleave4x4Kernel
- * -# @ref NEGEMMTranspose1xWKernel
- * -# @ref NEGEMMLowpMatrixMultiplyKernel
- * -# @ref NEGEMMLowpOffsetContributionKernel
- * -# @ref NEActivationLayer
+/** Function to run Gemm on quantized types.
*
- * otherwise if the DOT product instruction is available:
+ * This function calls the following:
*
- * -# @ref NEGEMMLowpOffsetContributionKernel
- *
-*/
+ * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore
+ */
class NEGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
/** Constructor */
- NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
/** Default move constructor */
@@ -99,6 +80,7 @@ public:
* |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
* |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
* |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 |
*
* @note GEMM_LOWP: low precision GEMM kernel
* This kernel performs the following computations:
@@ -107,71 +89,36 @@ public:
* -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise
*
* @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
* @param[in] b Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32/F32
+ * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[in] output Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should be executed only for the first run
+ * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue;
- std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<NEGEMMInterleave4x4Kernel> _mtx_a_reshape_kernel;
- std::unique_ptr<NEGEMMTranspose1xWKernel> _mtx_b_reshape_kernel;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
- std::unique_ptr<NEGEMMLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
- std::unique_ptr<NEGEMMLowpOffsetContributionKernel> _offset_contribution_kernel;
- std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
- NEActivationLayer _activation_func;
- std::unique_ptr<NEConvertQuantizedSignednessKernel> _convert_to_signed_asymm;
- std::unique_ptr<NEConvertQuantizedSignednessKernel> _convert_from_signed_asymm;
-
- Tensor _vector_sum_col;
- Tensor _vector_sum_row;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _mm_result_s32;
- Tensor _signed_a;
- Tensor _signed_output;
- const ITensor *_original_b;
- int32_t _a_offset;
- int32_t _b_offset;
-
- bool _run_vector_matrix_multiplication;
- bool _assembly_path;
- bool _fused_assembly_path;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _fuse_output_stage;
- bool _run_activation;
- bool _flip_signedness;
-
- ITensorPack _asm_glue_tensors{};
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index fa5f5e3826..0d932bb4af 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
/** This file contains all available output stages for GEMMLowp.
*
@@ -39,237 +39,17 @@ namespace arm_compute
{
class ITensor;
class ITensorInfo;
-
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint.
- *
- * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint();
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint.
- *
- * NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint();
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint.
- *
- * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- * result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint();
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
/** Basic function to execute GEMMLowpQuantizeDown kernels.
*
- * This function calls the following kernels:
+ * This function calls the following operators:
*
- * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ * -# @ref cpu::CpuGemmLowpOutputStage
*/
-class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder
+class NEGEMMLowpOutputStage : public IFunction
{
public:
/** Constructor */
- NEGEMMLowpOutputStage() = default;
+ NEGEMMLowpOutputStage();
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -309,7 +89,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h
index 393a38ee4d..9c7ae0134d 100644
--- a/arm_compute/runtime/NEON/functions/NEGather.h
+++ b/arm_compute/runtime/NEON/functions/NEGather.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,18 +49,17 @@ public:
* |All |All |
*
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the following type: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
+ * @note The "axis" must be in the range [0, input.rank -1] when indices is a vector, and must be 1 when indices is a 2D or 3D tensor.
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ *
*/
void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value Must be in range [0, input.shape[@p axis])
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * Similar to @ref NEGather::configure()
*
* @return a status
*/
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 3b683382ec..0f294fde22 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -95,7 +95,12 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+ void configure(const ITensor *scores,
+ const ITensor *deltas,
+ const ITensor *anchors,
+ ITensor *proposals,
+ ITensor *scores_out,
+ ITensor *num_valid_proposals,
const GenerateProposalsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer
@@ -112,7 +117,11 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ static Status validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals,
const GenerateProposalsInfo &info);
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
index bb0697072b..0bc57be09e 100644
--- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -89,7 +89,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f);
// Inherited methods overridden:
void run() override;
@@ -103,5 +107,5 @@ private:
Tensor _permuted_input;
Tensor _permuted_output;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index 7f1a5e785e..8502cee5d2 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -97,5 +97,5 @@ private:
std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel;
Tensor _sumsq;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index 075fb4530a..629c5d10a0 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -25,7 +25,8 @@
#define ARM_COMPUTE_NELSTMLAYER_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -35,7 +36,6 @@
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
{
@@ -104,13 +104,26 @@ public:
* @param[in] projection_threshold The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0 then clipping is disabled.
*/
- void configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *output_state_in, const ITensor *cell_state_in,
- ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
- const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *output_state_in,
+ const ITensor *cell_state_in,
+ ITensor *scratch_buffer,
+ ITensor *output_state_out,
+ ITensor *cell_state_out,
+ ITensor *output,
+ const LSTMParams<ITensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
*
@@ -151,13 +164,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index 2f0c753691..ae951669b3 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -38,8 +39,6 @@
#include "arm_compute/runtime/NEON/functions/NESlice.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
-
namespace arm_compute
{
// Forward declarations
@@ -50,7 +49,7 @@ class ITensor;
* This function calls the following functions/kernels:
*
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
* -# @ref NETranspose Matrix transpose
* -# @ref NEConcatenateLayer Tensor concatenation
* -# @ref NEActivationLayer Activation functions (tanh and logistic)
@@ -104,11 +103,22 @@ public:
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
*/
void configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out);
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
*
@@ -133,11 +143,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
@@ -147,30 +168,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- NEGEMMLowpMatrixMultiplyCore _gemmlowp;
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- NETranspose _transpose_weights;
- NEConcatenateLayer _concat_input_weights;
- NEConcatenateLayer _concat_recurrent_weights;
- NEConcatenateLayer _concat_weights;
- NEConcatenateLayer _concat_inputs;
- NEConcatenateLayer _concat_bias;
- NEActivationLayer _sigmoid_forget_gate;
- NEActivationLayer _sigmoid_input_gate;
- NEActivationLayer _sigmoid_output_gate;
- NEActivationLayer _tanh_modulation_gate;
- NEActivationLayer _tanh_output_state;
- NEArithmeticAddition _add1;
- NEArithmeticAddition _add2;
- NEPixelWiseMultiplication _mul1;
- NEPixelWiseMultiplication _mul2;
- NEPixelWiseMultiplication _mul3;
- NESlice _slice_input_tensor;
- NESlice _slice_forget_tensor;
- NESlice _slice_cell_tensor;
- NESlice _slice_output_tensor;
- NEDequantizationLayer _dequantize;
- NEQuantizationLayer _quantize;
+ NEGEMMLowpMatrixMultiplyCore _gemmlowp;
+ NEGEMMLowpOutputStage _output_stage;
+ NETranspose _transpose_weights;
+ NEConcatenateLayer _concat_input_weights;
+ NEConcatenateLayer _concat_recurrent_weights;
+ NEConcatenateLayer _concat_weights;
+ NEConcatenateLayer _concat_inputs;
+ NEConcatenateLayer _concat_bias;
+ NEActivationLayer _sigmoid_forget_gate;
+ NEActivationLayer _sigmoid_input_gate;
+ NEActivationLayer _sigmoid_output_gate;
+ NEActivationLayer _tanh_modulation_gate;
+ NEActivationLayer _tanh_output_state;
+ NEArithmeticAddition _add1;
+ NEArithmeticAddition _add2;
+ NEPixelWiseMultiplication _mul1;
+ NEPixelWiseMultiplication _mul2;
+ NEPixelWiseMultiplication _mul3;
+ NESlice _slice_input_tensor;
+ NESlice _slice_forget_tensor;
+ NESlice _slice_cell_tensor;
+ NESlice _slice_output_tensor;
+ NEDequantizationLayer _dequantize;
+ NEQuantizationLayer _quantize;
// Tensor pointers
const ITensor *_input_to_input_weights;
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
new file mode 100644
index 0000000000..58dd7a6f6b
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Settings for MatMul Cpu implementation*/
+class CpuMatMulSettings
+{
+public:
+ // get fast math flag
+ bool fast_math() const
+ {
+ return _fast_math;
+ }
+ // get fixed format flag
+ bool fixed_format() const
+ {
+ return _fixed_format;
+ }
+ // Set fast math flag
+ CpuMatMulSettings &fast_math(bool fmath)
+ {
+ _fast_math = fmath;
+ return *this;
+ }
+ // Set fixed format flag
+ CpuMatMulSettings &fixed_format(bool fixed_format)
+ {
+ _fixed_format = fixed_format;
+ return *this;
+ }
+
+private:
+ bool _fast_math{false};
+ bool _fixed_format{false};
+};
+
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class MatMulInfo;
+class Status;
+
+/** Basic function to run the following operators:
+ *
+ * -# @ref cpu::CpuMatMul
+ */
+class NEMatMul : public IFunction
+{
+public:
+ /** Constructor */
+ NEMatMul();
+ /** Destructor */
+ ~NEMatMul();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMatMul(const NEMatMul &) = delete;
+ /** Default move constructor */
+ NEMatMul(NEMatMul &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMatMul &operator=(const NEMatMul &) = delete;
+ /** Default move assignment operator */
+ NEMatMul &operator=(NEMatMul &&) = default;
+ /** Initialize
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:------------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ *
+ * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function level settings i.e fast math
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ */
+ void configure(ITensor *lhs,
+ ITensor *rhs,
+ ITensor *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul
+ *
+ * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function level settings i.e fast math
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ *
+ * @return Status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
index 41ea040457..e00fc4544f 100644
--- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -33,12 +34,10 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
class NEFill;
-class NEMaxUnpoolingLayerKernel;
/** Function to perform MaxUnpooling. This function calls the following kernels:
*
* -# @ref NEFill
- * -# @ref NEMaxUnpoolingLayerKernel
*/
class NEMaxUnpoolingLayer : public IFunction
{
@@ -88,14 +87,18 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
private:
- std::unique_ptr<NEFill> _fill_func;
- std::unique_ptr<NEMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
+ std::unique_ptr<NEFill> _fill_func;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index fbe000445c..27e3fa674e 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H
#define ARM_COMPUTE_NENORMALIZATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
@@ -88,16 +87,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group; /**< Function memory group */
- std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */
- NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */
- Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+ MemoryGroup _memory_group; /**< Function memory group */
+ std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index 4aa6725496..494b1c0641 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -24,14 +24,14 @@
#ifndef ARM_COMPUTE_NEPADLAYER_H
#define ARM_COMPUTE_NEPADLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include "arm_compute/runtime/SubTensor.h"
-
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
+
#include <memory>
namespace arm_compute
@@ -82,7 +82,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -95,7 +99,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run() override;
@@ -109,7 +117,10 @@ private:
* specifies the front and the end padding in the i-th dimension.
* @param[in] constant_value Constant value to be used for the padding
*/
- void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value);
+ void configure_constant_mode(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value);
/** Configure functions for when reflect or symmetric padding is used.
*
* @param[in] input Source tensor. Data types supported: All.
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index c863fde0ac..2cef64764d 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEPERMUTE_H
#define ARM_COMPUTE_NEPERMUTE_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 4684c2d4b8..3d81bf6087 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,9 @@
#ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
+#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -93,7 +95,12 @@ public:
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
*
@@ -120,7 +127,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -156,7 +168,10 @@ public:
* @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -164,7 +179,10 @@ public:
* @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
new file mode 100644
index 0000000000..09251f2a5f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPOOLING3DLAYER_H
+#define ARM_COMPUTE_NEPOOLING3DLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class IMemoryManager;
+/** Basic function to simulate a pooling 3d layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref cpu::CpuPool3d
+ */
+class NEPooling3dLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPooling3dLayer(const NEPooling3dLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPooling3dLayer &operator=(const NEPooling3dLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPooling3dLayer(NEPooling3dLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPooling3dLayer &operator=(NEPooling3dLayer &&) = delete;
+ /** Default destructor */
+ ~NEPooling3dLayer();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer
+ *
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor info.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 9398e1fce9..768ad0d818 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -71,6 +71,8 @@ public:
* |F32 |F32 |
*
* @note F16 is supported for pool sizes 2 and 3 only
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ * Cases where pooling region is completely outside input tensor are only supported for floating point data type
*
* @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -89,7 +91,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
void run() override;
@@ -98,5 +103,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
index 38e0c9f3ad..858e3299af 100644
--- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
@@ -62,7 +62,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 7c2e9bc5a1..009a4e0911 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,15 +25,17 @@
#define ARM_COMPUTE_NEQLSTMLAYER_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
#include <memory>
@@ -43,8 +45,13 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
class NEQLSTMLayerNormalizationKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace cpu
/** Basic function to run @ref NEQLSTMLayer
*
* This function calls the following kernels:
@@ -54,8 +61,8 @@ class NEGEMMLowpMatrixAReductionKernel;
* -# @ref NEArithmeticSubtraction Elementwise subtraction
* -# @ref NECopy Copy kernel for copying output_state_out to output
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel For precomputing effective biases to use
* -# @ref NEPixelWiseMultiplication Elementwise multiplication
* -# @ref NETranspose Transpose function for reshaping the weights
* */
@@ -123,12 +130,21 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *cell_state_in, ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+ void configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *cell_state_in,
+ ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out,
+ ITensor *output,
const LSTMParams<ITensor> &lstm_params);
/** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer
@@ -173,12 +189,21 @@ public:
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params);
// Inherited methods overridden:
@@ -211,10 +236,17 @@ private:
* @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor.
*
*/
- void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res,
- Tensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+ void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+ NEGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensor *mm_input,
+ const ITensor *mm_weights,
+ const ITensor *bias,
+ Tensor *mm_res,
+ Tensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info);
MemoryGroup _memory_group;
@@ -223,8 +255,8 @@ private:
{
static constexpr uint32_t max_dimension_supported = 2;
- ITensor *_src{ nullptr };
- ITensor *_dst{ nullptr };
+ ITensor *_src{nullptr};
+ ITensor *_dst{nullptr};
size_t _row_size{};
Window _window{};
@@ -250,70 +282,73 @@ private:
};
// Functions used
- NETranspose _transpose_input_to_forget_weights;
- NETranspose _transpose_input_to_cell_weights;
- NETranspose _transpose_input_to_output_weights;
- NETranspose _transpose_input_to_input_weights;
- NETranspose _transpose_recurrent_to_forget_weights;
- NETranspose _transpose_recurrent_to_cell_weights;
- NETranspose _transpose_recurrent_to_output_weights;
- NETranspose _transpose_recurrent_to_input_weights;
- NETranspose _transpose_projection_weights;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction;
- NEArithmeticAddition _projection_bias_add;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget;
- NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget;
- NEGEMMLowpOutputStage _input_to_forget_outstage;
- NEGEMMLowpOutputStage _recurrent_to_forget_outstage;
- NEGEMMLowpOutputStage _cell_to_forget_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_forget;
- NEArithmeticAddition _accumulate_cell_forget;
- NEActivationLayer _forget_gate_sigmoid;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell;
- NEGEMMLowpOutputStage _input_to_cell_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell;
- NEGEMMLowpOutputStage _recurrent_to_cell_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_modulation;
- NEActivationLayer _cell_gate_tanh;
- NEArithmeticSubtraction _input_gate_sub;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_input;
- NEGEMMLowpOutputStage _input_to_input_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input;
- NEGEMMLowpOutputStage _recurrent_to_input_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_input;
- NEPixelWiseMultiplication _pixelwise_mul_cell_to_input;
- NEGEMMLowpOutputStage _cell_to_input_outstage;
- NEArithmeticAddition _accumulate_cell_input;
- NEActivationLayer _input_gate_sigmoid;
- NEPixelWiseMultiplication _pixelwise_mul_forget_cell;
- NEPixelWiseMultiplication _pixelwise_mul_input_cell;
- NEArithmeticAddition _add_forget_cell;
- NEActivationLayer _cell_clip;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_output;
- NEGEMMLowpOutputStage _input_to_output_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output;
- NEGEMMLowpOutputStage _recurrent_to_output_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_output;
- NEPixelWiseMultiplication _pixelwise_mul_cell_to_output;
- NEGEMMLowpOutputStage _cell_to_output_outstage;
- NEArithmeticAddition _accumulate_cell_to_output;
- NEActivationLayer _output_gate_sigmoid;
- NEActivationLayer _hidden_tanh;
- NEPixelWiseMultiplication _pixelwise_mul_hidden;
- NEGEMMLowpOutputStage _hidden_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_projection;
- NEGEMMLowpOutputStage _projection_outstage;
- NEArithmeticAddition _accumulate_projection;
- NEActivationLayer _projection_clip;
+
+ NEDequantizationLayer _dequantize_input_to_forget_weights;
+ NEQuantizationLayer _quantize_input_to_forget_weights;
+ NETranspose _transpose_input_to_forget_weights;
+ NETranspose _transpose_input_to_cell_weights;
+ NETranspose _transpose_input_to_output_weights;
+ NETranspose _transpose_input_to_input_weights;
+ NETranspose _transpose_recurrent_to_forget_weights;
+ NETranspose _transpose_recurrent_to_cell_weights;
+ NETranspose _transpose_recurrent_to_output_weights;
+ NETranspose _transpose_recurrent_to_input_weights;
+ NETranspose _transpose_projection_weights;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction;
+ NEArithmeticAddition _projection_bias_add;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget;
+ NEGEMMLowpOutputStage _input_to_forget_outstage;
+ NEGEMMLowpOutputStage _recurrent_to_forget_outstage;
+ NEGEMMLowpOutputStage _cell_to_forget_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_forget;
+ NEArithmeticAddition _accumulate_cell_forget;
+ NEActivationLayer _forget_gate_sigmoid;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell;
+ NEGEMMLowpOutputStage _input_to_cell_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell;
+ NEGEMMLowpOutputStage _recurrent_to_cell_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_modulation;
+ NEActivationLayer _cell_gate_tanh;
+ NEArithmeticSubtraction _input_gate_sub;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_input;
+ NEGEMMLowpOutputStage _input_to_input_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input;
+ NEGEMMLowpOutputStage _recurrent_to_input_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_input;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_input;
+ NEGEMMLowpOutputStage _cell_to_input_outstage;
+ NEArithmeticAddition _accumulate_cell_input;
+ NEActivationLayer _input_gate_sigmoid;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_cell;
+ NEPixelWiseMultiplication _pixelwise_mul_input_cell;
+ NEArithmeticAddition _add_forget_cell;
+ NEActivationLayer _cell_clip;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_output;
+ NEGEMMLowpOutputStage _input_to_output_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output;
+ NEGEMMLowpOutputStage _recurrent_to_output_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_output;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_output;
+ NEGEMMLowpOutputStage _cell_to_output_outstage;
+ NEArithmeticAddition _accumulate_cell_to_output;
+ NEActivationLayer _output_gate_sigmoid;
+ NEActivationLayer _hidden_tanh;
+ NEPixelWiseMultiplication _pixelwise_mul_hidden;
+ NEGEMMLowpOutputStage _hidden_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_projection;
+ NEGEMMLowpOutputStage _projection_outstage;
+ NEArithmeticAddition _accumulate_projection;
+ NEActivationLayer _projection_clip;
TensorCopyKernel _projection_bias_copy;
TensorCopyKernel _projection_output_to_accumulate_copy;
@@ -325,19 +360,16 @@ private:
NECopy _copy_output;
// Tensor pointers
- const ITensor *_input_to_input_weights
- {
- nullptr
- };
- const ITensor *_recurrent_to_input_weights{ nullptr };
- const ITensor *_projection_bias{ nullptr };
- const ITensor *_input_to_forget_weights{ nullptr };
- const ITensor *_input_to_cell_weights{ nullptr };
- const ITensor *_input_to_output_weights{ nullptr };
- const ITensor *_recurrent_to_forget_weights{ nullptr };
- const ITensor *_recurrent_to_cell_weights{ nullptr };
- const ITensor *_recurrent_to_output_weights{ nullptr };
- const ITensor *_projection_weights{ nullptr };
+ const ITensor *_input_to_input_weights{nullptr};
+ const ITensor *_recurrent_to_input_weights{nullptr};
+ const ITensor *_projection_bias{nullptr};
+ const ITensor *_input_to_forget_weights{nullptr};
+ const ITensor *_input_to_cell_weights{nullptr};
+ const ITensor *_input_to_output_weights{nullptr};
+ const ITensor *_recurrent_to_forget_weights{nullptr};
+ const ITensor *_recurrent_to_cell_weights{nullptr};
+ const ITensor *_recurrent_to_output_weights{nullptr};
+ const ITensor *_projection_weights{nullptr};
std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};
@@ -372,63 +404,66 @@ private:
return _layer_norms[getGateIndex(g)];
}
- void configure_layer_norm(LayerNormGate g, const ITensor *in);
+ void configure_layer_norm(LayerNormGate g, const ITensor *in);
static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
- Tensor _input_to_forget_weights_transposed{ nullptr };
- Tensor _input_to_cell_weights_transposed{ nullptr };
- Tensor _input_to_output_weights_transposed{ nullptr };
- Tensor _input_to_input_weights_transposed{ nullptr };
- Tensor _recurrent_to_forget_weights_transposed{ nullptr };
- Tensor _recurrent_to_cell_weights_transposed{ nullptr };
- Tensor _recurrent_to_output_weights_transposed{ nullptr };
- Tensor _recurrent_to_input_weights_transposed{ nullptr };
- Tensor _projection_weights_transposed{ nullptr };
- Tensor _input_to_input_eff_bias{ nullptr };
- Tensor _recurrent_to_input_eff_bias{ nullptr };
- Tensor _input_to_forget_eff_bias{ nullptr };
- Tensor _recurrent_to_forget_eff_bias{ nullptr };
- Tensor _input_to_cell_eff_bias{ nullptr };
- Tensor _recurrent_to_cell_eff_bias{ nullptr };
- Tensor _input_to_output_eff_bias{ nullptr };
- Tensor _recurrent_to_output_eff_bias{ nullptr };
- Tensor _projection_reduction_res{ nullptr };
- Tensor _projection_eff_bias{ nullptr };
- Tensor _mm_input_to_forget_res{ nullptr };
- Tensor _mm_recurrent_to_forget_res{ nullptr };
- Tensor _mul_cell_to_forget_res{ nullptr };
- Tensor _input_to_forget_outstage_res{ nullptr };
- Tensor _cell_to_forget_outstage_res{ nullptr };
- Tensor _recurrent_to_forget_outstage_res{ nullptr };
- Tensor _forget_gate{ nullptr };
- Tensor _mm_input_to_cell_res{ nullptr };
- Tensor _input_to_cell_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_cell_res{ nullptr };
- Tensor _recurrent_to_cell_outstage_res{ nullptr };
- Tensor _cell_gate{ nullptr };
- Tensor _mul_input_cell_res{ nullptr };
- Tensor _mm_input_to_input_res{ nullptr };
- Tensor _input_to_input_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_input_res{ nullptr };
- Tensor _mul_cell_to_input_res{ nullptr };
- Tensor _cell_to_input_outstage_res{ nullptr };
- Tensor _recurrent_to_input_outstage_res{ nullptr };
- Tensor _input_gate{ nullptr };
- Tensor _mm_input_to_output_res{ nullptr };
- Tensor _input_to_output_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_output_res{ nullptr };
- Tensor _mul_cell_to_output_res{ nullptr };
- Tensor _cell_to_output_outstage_res{ nullptr };
- Tensor _recurrent_to_output_outstage_res{ nullptr };
- Tensor _output_gate{ nullptr };
- Tensor _hidden_mul_res{ nullptr };
- Tensor _hidden_gate{ nullptr };
- Tensor _mm_projection_res{ nullptr };
- Tensor _projection_outstage_res{ nullptr };
- Tensor _projection_out_res{ nullptr };
- Tensor _projection_accumulate_res{ nullptr };
- Tensor _ones{ nullptr };
+ Tensor _input_to_forget_weights_f32{nullptr};
+ Tensor _input_to_forget_weights_symm8{nullptr};
+
+ Tensor _input_to_forget_weights_transposed{nullptr};
+ Tensor _input_to_cell_weights_transposed{nullptr};
+ Tensor _input_to_output_weights_transposed{nullptr};
+ Tensor _input_to_input_weights_transposed{nullptr};
+ Tensor _recurrent_to_forget_weights_transposed{nullptr};
+ Tensor _recurrent_to_cell_weights_transposed{nullptr};
+ Tensor _recurrent_to_output_weights_transposed{nullptr};
+ Tensor _recurrent_to_input_weights_transposed{nullptr};
+ Tensor _projection_weights_transposed{nullptr};
+ Tensor _input_to_input_eff_bias{nullptr};
+ Tensor _recurrent_to_input_eff_bias{nullptr};
+ Tensor _input_to_forget_eff_bias{nullptr};
+ Tensor _recurrent_to_forget_eff_bias{nullptr};
+ Tensor _input_to_cell_eff_bias{nullptr};
+ Tensor _recurrent_to_cell_eff_bias{nullptr};
+ Tensor _input_to_output_eff_bias{nullptr};
+ Tensor _recurrent_to_output_eff_bias{nullptr};
+ Tensor _projection_reduction_res{nullptr};
+ Tensor _projection_eff_bias{nullptr};
+ Tensor _mm_input_to_forget_res{nullptr};
+ Tensor _mm_recurrent_to_forget_res{nullptr};
+ Tensor _mul_cell_to_forget_res{nullptr};
+ Tensor _input_to_forget_outstage_res{nullptr};
+ Tensor _cell_to_forget_outstage_res{nullptr};
+ Tensor _recurrent_to_forget_outstage_res{nullptr};
+ Tensor _forget_gate{nullptr};
+ Tensor _mm_input_to_cell_res{nullptr};
+ Tensor _input_to_cell_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_cell_res{nullptr};
+ Tensor _recurrent_to_cell_outstage_res{nullptr};
+ Tensor _cell_gate{nullptr};
+ Tensor _mul_input_cell_res{nullptr};
+ Tensor _mm_input_to_input_res{nullptr};
+ Tensor _input_to_input_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_input_res{nullptr};
+ Tensor _mul_cell_to_input_res{nullptr};
+ Tensor _cell_to_input_outstage_res{nullptr};
+ Tensor _recurrent_to_input_outstage_res{nullptr};
+ Tensor _input_gate{nullptr};
+ Tensor _mm_input_to_output_res{nullptr};
+ Tensor _input_to_output_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_output_res{nullptr};
+ Tensor _mul_cell_to_output_res{nullptr};
+ Tensor _cell_to_output_outstage_res{nullptr};
+ Tensor _recurrent_to_output_outstage_res{nullptr};
+ Tensor _output_gate{nullptr};
+ Tensor _hidden_mul_res{nullptr};
+ Tensor _hidden_gate{nullptr};
+ Tensor _mm_projection_res{nullptr};
+ Tensor _projection_outstage_res{nullptr};
+ Tensor _projection_out_res{nullptr};
+ Tensor _projection_accumulate_res{nullptr};
+ Tensor _ones{nullptr};
std::array<Tensor, _layer_norm_count> _layer_norm_output{};
inline Tensor &get_layer_norm_output(LayerNormGate g)
@@ -436,14 +471,15 @@ private:
return _layer_norm_output[getGateIndex(g)];
}
- bool _is_prepared{ false };
- bool _has_cifg{ false };
- bool _has_cell_clipping{ false };
- bool _has_projection{ false };
- bool _has_projection_clipping{ false };
- bool _has_peephole{ false };
- bool _has_layer_norm{ false };
- bool _projection_tensor_copy_required{ false };
+ bool _is_prepared{false};
+ bool _has_cifg{false};
+ bool _has_cell_clipping{false};
+ bool _has_projection{false};
+ bool _has_projection_clipping{false};
+ bool _has_peephole{false};
+ bool _has_layer_norm{false};
+ bool _projection_tensor_copy_required{false};
+ bool _convert_input_to_forget_weights_to_qsymm8{false};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEQLSTMLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index 667d3144ac..af7f464ac9 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -72,7 +72,13 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *recurrent_weights,
+ const ITensor *bias,
+ ITensor *hidden_state,
+ ITensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -85,7 +91,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
const ActivationLayerInfo &info);
// Inherited methods overridden:
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index ea1af4daea..b06ebe899d 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -77,7 +77,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 2992b3eb95..929111ad4b 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/IArray.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -73,7 +74,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
@@ -91,7 +93,10 @@ public:
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h
index cb14c8fdde..609456a4ef 100644
--- a/arm_compute/runtime/NEON/functions/NERange.h
+++ b/arm_compute/runtime/NEON/functions/NERange.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 7512115a3f..5b8d8cdf2b 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,9 @@
#ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H
#define ARM_COMPUTE_NEON_REDUCE_MEAN_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
@@ -83,7 +80,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+ static Status
+ validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
@@ -93,13 +91,8 @@ private:
std::vector<NEReductionOperation> _reduction_kernels;
std::vector<Tensor> _reduced_outs;
NEReshapeLayer _reshape;
- NEDequantizationLayer _dequant;
- NEQuantizationLayer _requant;
int _reduction_ops;
bool _keep_dims;
- bool _do_requant;
- Tensor _input_no_quant;
- Tensor _output_no_quant;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 533c10adcf..f5391a6d0e 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -25,9 +25,9 @@
#define ARM_COMPUTE_NEREDUCTIONOPERATION_H
#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include <memory>
namespace arm_compute
@@ -88,7 +88,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
deleted file mode 100644
index 271ac9739b..0000000000
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAP_H
-#define ARM_COMPUTE_NEREMAP_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute remap. This function calls the following kernels:
- *
- * -# @ref NERemapKernel
- */
-class NERemap : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------|:------|:------|:------|
- * |U8 |F32 |F32 |U 8 |
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
- *
- */
- void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEREMAP_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
new file mode 100644
index 0000000000..e3fa7b9c16
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class NEReorderKernel;
+/** Function to compute blocked reorder. */
+class NEReorderLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NEReorderLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReorderLayer(const NEReorderLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReorderLayer &operator=(const NEReorderLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEReorderLayer(NEReorderLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEReorderLayer &operator=(NEReorderLayer &&) = delete;
+ /** Default destructor */
+ ~NEReorderLayer();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F32 |F32 |
+ *
+ * @param[in] input Source tensor. Data type supported: F32. Data layouts supported: NCHW.
+ * @param[out] output Destination with the same dimensions, data type, data layout as @p input
+ * except last dimension of data layout which needs to be multiple of blocking parameter ksize
+ * @param[in] input_wf WeightFormat of input.
+ * @param[in] output_wf WeightFormat of output.
+ */
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer
+ *
+ * Similar to @ref NEReorderLayer::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<NEReorderKernel> _reorder_kernel; /**< Reorder layer kernel */
+};
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */
+
+#endif // defined(__aarch64__)
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h
index c02fff54a5..e03e415068 100644
--- a/arm_compute/runtime/NEON/functions/NEReverse.h
+++ b/arm_compute/runtime/NEON/functions/NEReverse.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEREVERSE_H
-#define ARM_COMPUTE_NEREVERSE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
@@ -45,22 +44,33 @@ public:
* Valid data type configurations:
* |src0 |src1 |dst |
* |:--------------|:--------------|:--------------|
- * |All |U32 |All |
+ * |All |U32, S32 |All |
+ *
+ * @param[in] input Input tensor. Data types supported: All
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
+ *
+ * @note The value of each axis should be between [-rank, rank)
+ * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2].
+ *
+ * @deprecated Support for U32 in axis tensor will be removed in 24.02 release
*
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
*/
- void configure(const ITensor *input, ITensor *output, const ITensor *axis);
+ void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+ * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ const bool use_inverted_axis = false);
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEREVERSE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 0b7dddacb2..72dfa3bda4 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_NESCALEIMAGE_H
#define ARM_COMPUTE_NESCALEIMAGE_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -66,16 +65,19 @@ public:
* |F16 |F16 |
* |F32 |F32 |
* |U8 |U8 |
+ * |S8 |S8 |
* |S16 |S16 |
*
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to be used for configuration
+ *
+ * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear
*/
void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEScale
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to be used for validation
*
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index ac79a5c633..70a688d3b0 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -85,7 +85,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
// Inherited methods overridden:
void run() override;
@@ -129,7 +130,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
};
} // namespace experimental
} // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
index ad8c1467d0..5dee61a4a8 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
@@ -24,9 +24,9 @@
#ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H
#define ARM_COMPUTE_NESPACETOBATCHLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/Types.h"
#include <memory>
namespace arm_compute
@@ -82,7 +82,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -92,7 +97,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings)
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -104,7 +112,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h
index 206f299c06..36358a7094 100644
--- a/arm_compute/runtime/NEON/functions/NESplit.h
+++ b/arm_compute/runtime/NEON/functions/NESplit.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NESlice.h"
diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h
index ae4e468f21..98dacde0c1 100644
--- a/arm_compute/runtime/NEON/functions/NEStackLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NESTACKLAYER_H
-#define ARM_COMPUTE_NESTACKLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
@@ -91,9 +91,8 @@ public:
void run() override;
private:
- std::vector<ITensor *> _input;
- std::vector<std::unique_ptr<NEStackLayerKernel>> _stack_kernels;
- unsigned int _num_inputs;
+ std::unique_ptr<NEStackLayerKernel> _stack_kernel;
+ bool _is_prepared;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index 4b14d946f6..fa1113ffec 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -71,9 +71,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ITensor *input,
+ ITensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
*
@@ -89,9 +94,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
// Inherited methods overridden:
void run() override;
@@ -121,9 +131,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
*
@@ -139,9 +154,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
};
} // namespace experimental
} // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h
index 915e5aa1da..001a0a4128 100644
--- a/arm_compute/runtime/NEON/functions/NETile.h
+++ b/arm_compute/runtime/NEON/functions/NETile.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NETILE_H
#define ARM_COMPUTE_NETILE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 581fe74309..5d2d1f1b01 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NETRANSPOSE_H
#define ARM_COMPUTE_NETRANSPOSE_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -83,4 +82,4 @@ private:
std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETRANSPOSE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_NETRANSPOSE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h
index 079fee5b9e..e1af96d08d 100644
--- a/arm_compute/runtime/NEON/functions/NEUnstack.h
+++ b/arm_compute/runtime/NEON/functions/NEUnstack.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index f9ebf608cb..6caa2aeb59 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -40,13 +35,11 @@ namespace arm_compute
{
// Forward declarations
class ITensor;
-class ICPPKernel;
/** Basic function to simulate a convolution layer. This function calls the following kernels:
*
- * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method )
- * -# @ref NEWinogradLayerTransformInputKernel
- * -# @ref NEWinogradLayerTransformOutputKernel
+ * -# @ref cpu::CpuWinogradConv2dTransformInputKernel
+ * -# @ref cpu::CpuWinogradConv2dTransformOutputKernel
* -# @ref cpu::CpuGemmAssemblyDispatch
* -# @ref CPPPermute (three times: weights, input and output)
*
@@ -57,12 +50,16 @@ class NEWinogradConvolutionLayer : public IFunction
public:
/** Constructor */
NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = delete;
- /** Default destructor */
- ~NEWinogradConvolutionLayer() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete;
+ /** Default move assignment operator */
+ NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = default;
+ /** Destructor */
+ ~NEWinogradConvolutionLayer();
/** Set the input and output tensors.
*
@@ -80,7 +77,8 @@ public:
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * Currently only 3x3 and 5x5 kernels are supported.
+ * Supported kernel sizes: (height, width) -> 3x3, 1x3, 3x1, 5x5, 1x5, 5x1 for Fp32
+ * -> 3x3 for Fp16
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
@@ -89,63 +87,35 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
+ /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * Currently only 3x3 and 5x5 kernels are supported.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
+ * Similar to @ref NEWinogradConvolutionLayer::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete;
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
private:
- MemoryGroup _memory_group;
- NEGEMM _gemm_function;
- std::unique_ptr<ICPPKernel> _transform_input_kernel;
- std::unique_ptr<ICPPKernel> _transform_output_kernel;
- std::unique_ptr<ICPPKernel> _transform_weights_kernel;
- NEActivationLayer _activationlayer_function;
-
- CPPPermute _permute_input;
- CPPPermute _permute_weights;
- CPPPermute _permute_output;
- Tensor _input_transformed;
- Tensor _output_transformed;
- Tensor _input_workspace;
- Tensor _output_workspace;
- Tensor _kernel_storage;
- Tensor _input_nhwc;
- Tensor _output_nhwc;
- Tensor _weights_hwio;
- const ITensor *_input;
- const ITensor *_weights;
- ITensor *_output;
- bool _is_prepared;
- bool _is_activationlayer_enabled;
- DataLayout _data_layout;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index b522b403a9..c718e74359 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OMPSCHEDULER_H
-#define ARM_COMPUTE_OMPSCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -79,6 +79,8 @@ protected:
private:
unsigned int _num_threads;
+ bool _has_lmb;
+ unsigned int _nonlittle_num_cpus;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OMPSCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h
index 2eef61a236..13ebb9fbe3 100644
--- a/arm_compute/runtime/OffsetLifetimeManager.h
+++ b/arm_compute/runtime/OffsetLifetimeManager.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H
#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-
#include "arm_compute/runtime/Types.h"
#include <map>
@@ -62,7 +61,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- MappingType mapping_type() const override;
+ MappingType mapping_type() const override;
private:
// Inherited methods overridden:
diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h
index a5c363d866..7250194f85 100644
--- a/arm_compute/runtime/OffsetMemoryPool.h
+++ b/arm_compute/runtime/OffsetMemoryPool.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_OFFSETMEMORYPOOL_H
#include "arm_compute/runtime/IMemoryPool.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include "arm_compute/runtime/Types.h"
@@ -65,8 +64,8 @@ public:
const BlobInfo &info() const;
// Inherited methods overridden:
- void acquire(MemoryMappings &handles) override;
- void release(MemoryMappings &handles) override;
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
MappingType mapping_type() const override;
std::unique_ptr<IMemoryPool> duplicate() override;
diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h
index 8f1f4ba0a9..8bcdf5d3bf 100644
--- a/arm_compute/runtime/OperatorList.h
+++ b/arm_compute/runtime/OperatorList.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OPERATOR_LIST_H
-#define ARM_COMPUTE_OPERATOR_LIST_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H
+#define ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H
/** ActivationLayer
*
@@ -40,6 +40,16 @@
*
*/
+/** AddMulAdd
+ *
+ * Description:
+ * Performs a fused Add + Mul + Add [+ Relu-based-Activation] operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
/** ArgMinMaxLayer
*
* Description:
@@ -206,6 +216,16 @@
*
*/
+/** Conv3D
+ *
+ * Description:
+ * Function to compute a 3d convolution layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_3D
+ *
+ */
+
/** Copy
*
* Description:
@@ -637,6 +657,16 @@
*
*/
+/** MatMul
+ *
+ * Description:
+ * Computes a matrix multiplication in batches.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_BATCH_MATMUL
+ *
+ */
+
/** MaxUnpoolingLayer
*
* Description:
@@ -667,6 +697,16 @@
*
*/
+/** NormalizePlanarYUVLayer
+ *
+ * Description:
+ * Function to compute normalization planar YUV layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
/** PadLayer
*
* Description:
@@ -710,6 +750,16 @@
*
*/
+/** Pooling3dLayer
+ *
+ * Description:
+ * Function to perform pooling 3D with the specified pooling operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * N/A
+ *
+ */
+
/** PReluLayer
*
* Description:
@@ -794,6 +844,16 @@
*
*/
+/** ReorderLayer
+ *
+ * Description:
+ * Reorders a tensor to a different weights format.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
/** ReorgLayer
*
* Description:
@@ -989,14 +1049,4 @@
*
*/
-/** WinogradInputTransform
- *
- * Description:
- * Function to perform a Winograd transform on the input tensor.
- *
- * Equivalent Android NNAPI Op:
- * n/a
- *
- */
-
-#endif /* ARM_COMPUTE_OPERATOR_LIST_H */ \ No newline at end of file
+#endif // ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H
diff --git a/arm_compute/runtime/OperatorTensor.h b/arm_compute/runtime/OperatorTensor.h
index 92ae01934b..237585bec2 100644
--- a/arm_compute/runtime/OperatorTensor.h
+++ b/arm_compute/runtime/OperatorTensor.h
@@ -26,8 +26,8 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Types.h"
#include "arm_compute/runtime/experimental/Types.h"
+#include "arm_compute/runtime/Types.h"
#include <cstdint>
diff --git a/arm_compute/runtime/PoolManager.h b/arm_compute/runtime/PoolManager.h
index cc50fc04a4..6aa6aef6e2 100644
--- a/arm_compute/runtime/PoolManager.h
+++ b/arm_compute/runtime/PoolManager.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_POOLMANAGER_H
#define ARM_COMPUTE_POOLMANAGER_H
-#include "arm_compute/runtime/IPoolManager.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/IPoolManager.h"
+
#include "support/Mutex.h"
#include "support/Semaphore.h"
@@ -53,9 +53,9 @@ public:
PoolManager &operator=(PoolManager &&) = delete;
// Inherited methods overridden:
- IMemoryPool *lock_pool() override;
- void unlock_pool(IMemoryPool *pool) override;
- void register_pool(std::unique_ptr<IMemoryPool> pool) override;
+ IMemoryPool *lock_pool() override;
+ void unlock_pool(IMemoryPool *pool) override;
+ void register_pool(std::unique_ptr<IMemoryPool> pool) override;
std::unique_ptr<IMemoryPool> release_pool() override;
void clear_pools() override;
size_t num_pools() const override;
@@ -66,5 +66,5 @@ private:
std::unique_ptr<arm_compute::Semaphore> _sem; /**< Semaphore to control the queues */
mutable arm_compute::Mutex _mtx; /**< Mutex to control access to the queues */
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_POOLMANAGER_H */
diff --git a/arm_compute/runtime/RuntimeContext.h b/arm_compute/runtime/RuntimeContext.h
index 23bd267375..d64e609196 100644
--- a/arm_compute/runtime/RuntimeContext.h
+++ b/arm_compute/runtime/RuntimeContext.h
@@ -54,8 +54,8 @@ public:
IAssetManager *asset_manager() override;
private:
- std::unique_ptr<IScheduler> _owned_scheduler{ nullptr };
- IScheduler *_scheduler{ nullptr };
+ std::unique_ptr<IScheduler> _owned_scheduler{nullptr};
+ IScheduler *_scheduler{nullptr};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_RUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f95..7c83f86caa 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_SCHEDULER_H
-#define ARM_COMPUTE_SCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -74,11 +74,11 @@ public:
static bool is_available(Type t);
private:
- static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static Type _scheduler_type;
+ static std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_SCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
index 3ca066e1c8..2badb31b26 100644
--- a/arm_compute/runtime/SubTensor.h
+++ b/arm_compute/runtime/SubTensor.h
@@ -72,5 +72,5 @@ private:
ITensor *_parent;
mutable SubTensorInfo _info;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_SUBTENSOR_H */
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
index 172c8963f0..e71fbd4a96 100644
--- a/arm_compute/runtime/Tensor.h
+++ b/arm_compute/runtime/Tensor.h
@@ -59,7 +59,7 @@ public:
ITensorInfo *info() const override;
ITensorInfo *info() override;
uint8_t *buffer() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
+ void associate_memory_group(IMemoryGroup *memory_group) override;
private:
mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
index a5e16c4d90..d819931415 100644
--- a/arm_compute/runtime/TensorAllocator.h
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_TENSORALLOCATOR_H
#define ARM_COMPUTE_TENSORALLOCATOR_H
#include "arm_compute/runtime/ITensorAllocator.h"
-
#include "arm_compute/runtime/Memory.h"
#include "arm_compute/runtime/MemoryGroup.h"
diff --git a/arm_compute/runtime/common/LSTMParams.h b/arm_compute/runtime/common/LSTMParams.h
index aedb9c0d46..6800faf87f 100644
--- a/arm_compute/runtime/common/LSTMParams.h
+++ b/arm_compute/runtime/common/LSTMParams.h
@@ -79,7 +79,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_cifg_params(const T *input_to_input_weights, const T *recurrent_to_input_weights, T *cell_to_input_weights, const T *input_gate_bias)
+ LSTMParams &set_cifg_params(const T *input_to_input_weights,
+ const T *recurrent_to_input_weights,
+ T *cell_to_input_weights,
+ const T *input_gate_bias)
{
_input_to_input_weights = input_to_input_weights;
_recurrent_to_input_weights = recurrent_to_input_weights;
@@ -125,8 +128,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights, T *forget_layer_norm_weights,
- T *cell_layer_norm_weights, T *output_layer_norm_weights)
+ LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights,
+ T *forget_layer_norm_weights,
+ T *cell_layer_norm_weights,
+ T *output_layer_norm_weights)
{
_input_layer_norm_weights = input_layer_norm_weights;
_forget_layer_norm_weights = forget_layer_norm_weights;
@@ -169,7 +174,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_matmul_scale_params(float input_intermediate_scale, float forget_intermediate_scale, float cell_intermediate_scale, float output_intermediate_scale)
+ LSTMParams &set_matmul_scale_params(float input_intermediate_scale,
+ float forget_intermediate_scale,
+ float cell_intermediate_scale,
+ float output_intermediate_scale)
{
_input_intermediate_scale = input_intermediate_scale;
_forget_intermediate_scale = forget_intermediate_scale;
@@ -338,5 +346,5 @@ private:
bool _has_cifg_opt;
bool _use_layer_norm;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_LSTMPARAMS_H */