aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/CL
diff options
context:
space:
mode:
authorFelix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>2023-09-27 17:46:17 +0100
committerfelixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>2023-09-28 12:08:05 +0000
commitafd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch)
tree03bc7d5a762099989b16a656fa8d397b490ed70e /src/runtime/CL
parentbdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff)
downloadComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz
Apply clang-format on repository
Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/runtime/CL')
-rw-r--r--src/runtime/CL/CLBufferAllocator.cpp3
-rw-r--r--src/runtime/CL/CLGEMMHeuristicsHandle.cpp3
-rw-r--r--src/runtime/CL/CLHelpers.cpp41
-rw-r--r--src/runtime/CL/CLMemory.cpp12
-rw-r--r--src/runtime/CL/CLMemoryRegion.cpp26
-rw-r--r--src/runtime/CL/CLOperator.cpp5
-rw-r--r--src/runtime/CL/CLRuntimeContext.cpp6
-rw-r--r--src/runtime/CL/CLScheduler.cpp47
-rw-r--r--src/runtime/CL/CLSubTensor.cpp10
-rw-r--r--src/runtime/CL/CLTensorAllocator.cpp40
-rw-r--r--src/runtime/CL/CLTuner.cpp90
-rw-r--r--src/runtime/CL/ICLSimpleFunction.cpp5
-rw-r--r--src/runtime/CL/Utils.cpp16
-rw-r--r--src/runtime/CL/functions/CLActivationLayer.cpp22
-rw-r--r--src/runtime/CL/functions/CLArgMinMaxLayer.cpp63
-rw-r--r--src/runtime/CL/functions/CLBatchNormalizationLayer.cpp37
-rw-r--r--src/runtime/CL/functions/CLBatchToSpaceLayer.cpp30
-rw-r--r--src/runtime/CL/functions/CLBitwiseAnd.cpp10
-rw-r--r--src/runtime/CL/functions/CLBitwiseNot.cpp5
-rw-r--r--src/runtime/CL/functions/CLBitwiseOr.cpp10
-rw-r--r--src/runtime/CL/functions/CLBitwiseXor.cpp8
-rw-r--r--src/runtime/CL/functions/CLBoundingBoxTransform.cpp19
-rw-r--r--src/runtime/CL/functions/CLCast.cpp22
-rw-r--r--src/runtime/CL/functions/CLChannelShuffleLayer.cpp7
-rw-r--r--src/runtime/CL/functions/CLComparison.cpp37
-rw-r--r--src/runtime/CL/functions/CLConcatenateLayer.cpp28
-rw-r--r--src/runtime/CL/functions/CLConv3D.cpp39
-rw-r--r--src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp32
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp102
-rw-r--r--src/runtime/CL/functions/CLCopy.cpp15
-rw-r--r--src/runtime/CL/functions/CLCrop.cpp48
-rw-r--r--src/runtime/CL/functions/CLCropResize.cpp194
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayer.cpp74
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp17
-rw-r--r--src/runtime/CL/functions/CLDepthConvertLayer.cpp26
-rw-r--r--src/runtime/CL/functions/CLDepthToSpaceLayer.cpp8
-rw-r--r--src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp151
-rw-r--r--src/runtime/CL/functions/CLDequantizationLayer.cpp17
-rw-r--r--src/runtime/CL/functions/CLDirectConvolutionLayer.cpp46
-rw-r--r--src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp80
-rw-r--r--src/runtime/CL/functions/CLElementwiseOperations.cpp206
-rw-r--r--src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp78
-rw-r--r--src/runtime/CL/functions/CLFFT1D.cpp30
-rw-r--r--src/runtime/CL/functions/CLFFT2D.cpp16
-rw-r--r--src/runtime/CL/functions/CLFFTConvolutionLayer.cpp123
-rw-r--r--src/runtime/CL/functions/CLFill.cpp17
-rw-r--r--src/runtime/CL/functions/CLFlattenLayer.cpp24
-rw-r--r--src/runtime/CL/functions/CLFloor.cpp12
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp64
-rw-r--r--src/runtime/CL/functions/CLFuseBatchNormalization.cpp57
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp56
-rw-r--r--src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp88
-rw-r--r--src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp196
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp38
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp34
-rw-r--r--src/runtime/CL/functions/CLGather.cpp8
-rw-r--r--src/runtime/CL/functions/CLGenerateProposalsLayer.cpp192
-rw-r--r--src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp44
-rw-r--r--src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp36
-rw-r--r--src/runtime/CL/functions/CLL2NormalizeLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLLSTMLayer.cpp575
-rw-r--r--src/runtime/CL/functions/CLLSTMLayerQuantized.cpp410
-rw-r--r--src/runtime/CL/functions/CLLogicalAnd.cpp26
-rw-r--r--src/runtime/CL/functions/CLLogicalNot.cpp14
-rw-r--r--src/runtime/CL/functions/CLLogicalOr.cpp26
-rw-r--r--src/runtime/CL/functions/CLMatMul.cpp29
-rw-r--r--src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp7
-rw-r--r--src/runtime/CL/functions/CLNormalizationLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLPReluLayer.cpp23
-rw-r--r--src/runtime/CL/functions/CLPadLayer.cpp41
-rw-r--r--src/runtime/CL/functions/CLPermute.cpp18
-rw-r--r--src/runtime/CL/functions/CLPixelWiseMultiplication.cpp82
-rw-r--r--src/runtime/CL/functions/CLPooling3dLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLPoolingLayer.cpp31
-rw-r--r--src/runtime/CL/functions/CLPriorBoxLayer.cpp34
-rw-r--r--src/runtime/CL/functions/CLQLSTMLayer.cpp942
-rw-r--r--src/runtime/CL/functions/CLQuantizationLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLRNNLayer.cpp53
-rw-r--r--src/runtime/CL/functions/CLROIAlignLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLROIPoolingLayer.cpp19
-rw-r--r--src/runtime/CL/functions/CLRange.cpp5
-rw-r--r--src/runtime/CL/functions/CLReduceMean.cpp90
-rw-r--r--src/runtime/CL/functions/CLReductionOperation.cpp65
-rw-r--r--src/runtime/CL/functions/CLReorgLayer.cpp7
-rw-r--r--src/runtime/CL/functions/CLReshapeLayer.cpp14
-rw-r--r--src/runtime/CL/functions/CLReverse.cpp7
-rw-r--r--src/runtime/CL/functions/CLScale.cpp15
-rw-r--r--src/runtime/CL/functions/CLSelect.cpp8
-rw-r--r--src/runtime/CL/functions/CLSlice.cpp41
-rw-r--r--src/runtime/CL/functions/CLSoftmaxLayer.cpp22
-rw-r--r--src/runtime/CL/functions/CLSpaceToBatchLayer.cpp71
-rw-r--r--src/runtime/CL/functions/CLSpaceToDepthLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLSplit.cpp3
-rw-r--r--src/runtime/CL/functions/CLStackLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLStridedSlice.cpp81
-rw-r--r--src/runtime/CL/functions/CLTile.cpp8
-rw-r--r--src/runtime/CL/functions/CLTranspose.cpp12
-rw-r--r--src/runtime/CL/functions/CLUnstack.cpp38
-rw-r--r--src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp65
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp240
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp34
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp169
-rw-r--r--src/runtime/CL/gemm/CLGEMMKernelSelection.h3
-rw-r--r--src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp74
-rw-r--r--src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h7
-rw-r--r--src/runtime/CL/mlgo/Common.h40
-rw-r--r--src/runtime/CL/mlgo/HeuristicTree.cpp89
-rw-r--r--src/runtime/CL/mlgo/HeuristicTree.h24
-rw-r--r--src/runtime/CL/mlgo/MLGOHeuristics.cpp99
-rw-r--r--src/runtime/CL/mlgo/MLGOHeuristics.h6
-rw-r--r--src/runtime/CL/mlgo/MLGOParser.cpp188
-rw-r--r--src/runtime/CL/mlgo/MLGOParser.h9
-rw-r--r--src/runtime/CL/mlgo/Utils.cpp48
-rw-r--r--src/runtime/CL/mlgo/Utils.h10
-rw-r--r--src/runtime/CL/tuners/CLTuningParametersList.cpp50
117 files changed, 4282 insertions, 2612 deletions
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index e06ef3d37d..b4545b93bf 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -35,7 +35,8 @@ namespace arm_compute
void *CLBufferAllocator::allocate(size_t size, size_t alignment)
{
ARM_COMPUTE_UNUSED(alignment);
- cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) };
+ cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size,
+ nullptr, nullptr)};
return static_cast<void *>(buf);
}
diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
index 7168259fcd..d680dc08bb 100644
--- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
+++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
@@ -27,8 +27,7 @@
namespace arm_compute
{
-CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle()
- : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
+CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
{
}
CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default;
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 5b4bbbcde0..eb28ecbf8d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void
* @return A pointer to the context properties which can be used to create an opencl context
*/
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
+void initialise_context_properties(const cl::Platform &platform,
+ const cl::Device &device,
+ std::array<cl_context_properties, 7> &prop)
{
ARM_COMPUTE_UNUSED(device);
#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
// Query devices in the context for cl_arm_printf support
- if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+ if (arm_compute::device_supports_extension(device, "cl_arm_printf"))
{
// Create a cl_context with a printf_callback and user specified buffer size.
- std::array<cl_context_properties, 7> properties_printf =
- {
+ std::array<cl_context_properties, 7> properties_printf = {
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
// Enable a printf callback function for this context.
CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
// Request a minimum printf buffer size of 4MB for devices in the
// context that support this extension.
- CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
- 0
- };
+ CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0};
prop = properties_printf;
}
else
#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
{
- std::array<cl_context_properties, 3> properties =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
- 0
- };
+ std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM,
+ reinterpret_cast<cl_context_properties>(platform()), 0};
std::copy(properties.begin(), properties.end(), prop.begin());
};
}
@@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
cl::Platform::get(&platforms);
ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
- cl::Platform selected_platform{ nullptr };
+ cl::Platform selected_platform{nullptr};
// If the user has selected the Native platform, return the first available.
- switch(cl_backend_type)
+ switch (cl_backend_type)
{
case CLBackendType::Native:
selected_platform = platforms[0];
break;
case CLBackendType::Clvk:
- for(auto p : platforms)
+ for (auto p : platforms)
{
std::string res = p.getInfo<CL_PLATFORM_NAME>();
- if(res.find("clvk") != std::string::npos)
+ if (res.find("clvk") != std::string::npos)
{
selected_platform = p;
break;
@@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
ARM_COMPUTE_ERROR("Unsupported backend type");
}
- if(!selected_platform())
+ if (!selected_platform())
{
ARM_COMPUTE_ERROR("No valid platform found");
}
@@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
return selected_platform;
}
-std::tuple<cl::Context, cl::Device, cl_int>
-create_opencl_context_and_device(CLBackendType cl_backend_type)
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type)
{
ARM_COMPUTE_ERROR_ON(!opencl_is_available());
cl::Platform p = select_preferable_platform(cl_backend_type);
@@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type)
std::vector<cl::Device> platform_devices;
p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
- device = platform_devices[0];
- cl_int err = CL_SUCCESS;
- std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
+ device = platform_devices[0];
+ cl_int err = CL_SUCCESS;
+ std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0};
initialise_context_properties(p, device, properties);
cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
@@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type)
void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
- if(ctx)
+ if (ctx)
{
ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
ctx->gpu_scheduler()->enqueue(*kernel, flush);
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index a1743c56e6..c6ee6fde83 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -24,24 +24,22 @@
#include "arm_compute/runtime/CL/CLMemory.h"
#include "arm_compute/core/Error.h"
+
#include "support/Cast.h"
namespace arm_compute
{
-CLMemory::CLMemory()
- : _region(nullptr), _region_owned(nullptr)
+CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr)
{
}
-CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
- : _region(nullptr), _region_owned(memory)
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
}
-CLMemory::CLMemory(ICLMemoryRegion *memory)
- : _region(memory), _region_owned(nullptr)
+CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
{
_region = memory;
}
@@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
_region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
_region = _region_owned.get();
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 00f91a0ffb..835958b816 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -29,10 +29,7 @@
namespace arm_compute
{
ICLMemoryRegion::ICLMemoryRegion(size_t size)
- : IMemoryRegion(size),
- _ctx(CLScheduler::get().context()),
- _mapping(nullptr),
- _mem()
+ : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem()
{
}
@@ -57,17 +54,15 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
return nullptr;
}
-CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size)
- : ICLMemoryRegion(size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size)
{
- if(_size != 0)
+ if (_size != 0)
{
_mem = cl::Buffer(CLScheduler::get().context(), flags, _size);
}
}
-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
- : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
{
_mem = buffer;
}
@@ -102,10 +97,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
: ICLMemoryRegion(size), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
_ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment);
- if(_ptr != nullptr)
+ if (_ptr != nullptr)
{
_mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
}
@@ -114,7 +109,7 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a
ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
{
- if(_ptr != nullptr)
+ if (_ptr != nullptr)
{
try
{
@@ -125,7 +120,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
_mem = cl::Buffer();
clSVMFree(_ctx.get(), _ptr);
}
- catch(...)
+ catch (...)
{
}
}
@@ -144,7 +139,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size
void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
- clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+ clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr,
+ nullptr);
_mapping = _ptr;
return _mapping;
}
@@ -163,7 +159,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si
void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
{
- if(blocking)
+ if (blocking)
{
clFinish(q.get());
}
diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp
index 075a544077..89d4520038 100644
--- a/src/runtime/CL/CLOperator.cpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -30,14 +30,13 @@ namespace arm_compute
{
namespace experimental
{
-ICLOperator::ICLOperator(IRuntimeContext *ctx)
- : _kernel(), _ctx(ctx), _workspace()
+ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
{
}
void ICLOperator::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 5083b4b0c5..b426b8c304 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -29,7 +30,10 @@
namespace arm_compute
{
CLRuntimeContext::CLRuntimeContext()
- : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type()
+ : _gpu_owned_scheduler(std::make_unique<CLScheduler>()),
+ _gpu_scheduler(_gpu_owned_scheduler.get()),
+ _symbols(),
+ _backend_type()
{
_symbols.load_default();
auto ctx_dev_err = create_opencl_context_and_device(_backend_type);
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index b7a4dff45d..f0a42f55fd 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event()
void CLScheduler::tune_kernel_static(ICLKernel &kernel)
{
- if(_cl_tuner != nullptr)
+ if (_cl_tuner != nullptr)
{
_cl_tuner->tune_kernel_static(kernel);
}
@@ -95,8 +96,16 @@ bool CLScheduler::is_initialised() const
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(true),
- _job_chaining_size(1), _job_chaining_count(0)
+ : _context(),
+ _queue(),
+ _target(GPUTarget::MIDGARD),
+ _is_initialised(false),
+ _cl_tuner(nullptr),
+ _gemm_heuristics(nullptr),
+ _backend_type(CLBackendType::Native),
+ _job_chaining_enabled(true),
+ _job_chaining_size(1),
+ _job_chaining_count(0)
{
}
@@ -107,9 +116,12 @@ CLScheduler &CLScheduler::get()
return scheduler;
}
-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h)
+void CLScheduler::default_init_with_context(cl::Device &device,
+ cl::Context &ctx,
+ ICLTuner *cl_tuner,
+ CLGEMMHeuristicsHandle *gemm_h)
{
- if(!_is_initialised)
+ if (!_is_initialised)
{
const std::string cl_kernels_folder("./cl_kernels/");
cl::CommandQueue queue = cl::CommandQueue(ctx, device);
@@ -121,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx
void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
{
- if(!_is_initialised)
+ if (!_is_initialised)
{
cl::Context ctx;
cl::Device dev;
@@ -151,7 +163,12 @@ void CLScheduler::set_context(cl::Context context)
CLKernelLibrary::get().set_context(_context);
}
-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+void CLScheduler::init(cl::Context context,
+ cl::CommandQueue queue,
+ const cl::Device &device,
+ ICLTuner *cl_tuner,
+ CLGEMMHeuristicsHandle *gemm_h,
+ CLBackendType cl_backend_type)
{
set_context(std::move(context));
_queue = std::move(queue);
@@ -164,21 +181,21 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De
void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
{
- ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
- "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+ ARM_COMPUTE_ERROR_ON_MSG(
+ !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
const bool inject_memory = !tensors.empty();
// Tune the kernel if the CLTuner has been provided
- if(_cl_tuner != nullptr)
+ if (_cl_tuner != nullptr)
{
inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
}
// Run kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
- if(_job_chaining_enabled)
+ if (_job_chaining_enabled)
{
++_job_chaining_count;
}
@@ -188,9 +205,9 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
void CLScheduler::flush_queue(bool flush)
{
- if(_job_chaining_enabled)
+ if (_job_chaining_enabled)
{
- if(_job_chaining_count >= _job_chaining_size)
+ if (_job_chaining_count >= _job_chaining_size)
{
_job_chaining_count = 0;
/*
@@ -199,14 +216,14 @@ void CLScheduler::flush_queue(bool flush)
the CPU activity for job-scheduling.
For eg. job-chain size goes from 1, 2, 4, 8 and 16
*/
- if(_job_chaining_size < 16)
+ if (_job_chaining_size < 16)
{
_job_chaining_size <<= 1;
}
_queue.flush();
}
}
- else if(flush)
+ else if (flush)
{
_queue.flush();
}
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 14936ae23c..ace820bbb7 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -29,12 +29,14 @@
using namespace arm_compute;
-CLSubTensor::CLSubTensor()
- : _parent(nullptr), _info()
+CLSubTensor::CLSubTensor() : _parent(nullptr), _info()
{
}
-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
+CLSubTensor::CLSubTensor(ICLTensor *parent,
+ const TensorShape &tensor_shape,
+ const Coordinates &coords,
+ bool extend_parent)
: _parent(nullptr), _info()
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
@@ -81,7 +83,7 @@ void CLSubTensor::unmap()
uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
- if(_parent->buffer() == nullptr)
+ if (_parent->buffer() == nullptr)
{
_parent->map(q, blocking);
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index f85b8ae777..e6457218c7 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr;
std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment)
{
// Try fine-grain SVM
- std::unique_ptr<ICLMemoryRegion> region = std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
- size,
- alignment);
+ std::unique_ptr<ICLMemoryRegion> region =
+ std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
// Try coarse-grain SVM in case of failure
- if(region != nullptr && region->ptr() == nullptr)
+ if (region != nullptr && region->ptr() == nullptr)
{
region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment);
}
// Try legacy buffer memory in case of failure
- if(region != nullptr && region->ptr() == nullptr)
+ if (region != nullptr && region->ptr() == nullptr)
{
region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
}
@@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset)
* @param[in] qinfo Quantization info
* @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes
*/
-void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size)
+void populate_quantization_info(CLFloatArray &scale,
+ CLInt32Array &offset,
+ const QuantizationInfo &qinfo,
+ size_t pad_size)
{
clear_quantization_arrays(scale, offset);
@@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
scale = CLFloatArray(num_elements + pad_size);
scale.resize(num_elements);
- CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
+ CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size,
+ qinfo.scale().data());
- if(!qinfo.offset().empty())
+ if (!qinfo.offset().empty())
{
// Create offset array
- const std::vector<int32_t> &qoffset = qinfo.offset();
- const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
- offset = CLInt32Array(num_elements + pad_size);
+ const std::vector<int32_t> &qoffset = qinfo.offset();
+ const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
+ offset = CLInt32Array(num_elements + pad_size);
offset.resize(num_elements);
- CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data());
+ CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0,
+ num_elements * offset_element_size, qinfo.offset().data());
}
}
} // namespace
@@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext
CLQuantization CLTensorAllocator::quantization() const
{
- return { &_scale, &_offset };
+ return {&_scale, &_offset};
}
uint8_t *CLTensorAllocator::data()
@@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const
void CLTensorAllocator::allocate()
{
// Allocate tensor backing memory
- if(_associated_memory_group == nullptr)
+ if (_associated_memory_group == nullptr)
{
// Perform memory allocation
- if(static_global_cl_allocator != nullptr)
+ if (static_global_cl_allocator != nullptr)
{
_memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0));
}
@@ -146,7 +150,7 @@ void CLTensorAllocator::allocate()
}
// Allocate and fill the quantization parameter arrays
- if(is_data_type_quantized_per_channel(info().data_type()))
+ if (is_data_type_quantized_per_channel(info().data_type()))
{
const size_t pad_size = 0;
populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size);
@@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator)
uint8_t *CLTensorAllocator::lock()
{
- if(_ctx)
+ if (_ctx)
{
return map(_ctx->gpu_scheduler()->queue(), true);
}
@@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock()
void CLTensorAllocator::unlock()
{
ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- if(_ctx)
+ if (_ctx)
{
unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 445638f01f..0d62fe3afe 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
@@ -37,19 +38,23 @@
namespace arm_compute
{
CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
- : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
+ : real_clEnqueueNDRangeKernel(nullptr),
+ _tuning_params_table(),
+ _lws_table(),
+ _kernel_event(),
+ _tune_new_kernels(tune_new_kernels),
+ _tuning_info(tuning_info)
{
}
struct CLTuner::IKernelData
{
- virtual ~IKernelData() = default;
+ virtual ~IKernelData() = default;
virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0;
};
struct DefaultKernelData : public CLTuner::IKernelData
{
- DefaultKernelData(ITensorPack &tensors)
- : _tensors{ tensors }
+ DefaultKernelData(ITensorPack &tensors) : _tensors{tensors}
{
}
~DefaultKernelData() override = default;
@@ -100,16 +105,17 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Get the configuration ID from the kernel and append GPU target name and number of available compute units
- const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
+ const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" +
+ support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
// Check if we need to find the Optimal LWS. If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned
- if(kernel.config_id() != arm_compute::default_config_id)
+ if (kernel.config_id() != arm_compute::default_config_id)
{
auto p = _tuning_params_table.find(config_id);
- if(p == _tuning_params_table.end())
+ if (p == _tuning_params_table.end())
{
- if(_tune_new_kernels)
+ if (_tune_new_kernels)
{
// Find the optimal LWS for the kernel
CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data);
@@ -119,7 +125,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
// Set Local-Workgroup-Size
kernel.set_lws_hint(opt_tuning_params.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
}
@@ -129,7 +135,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Set Local-Workgroup-Size
kernel.set_lws_hint(p->second.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
kernel.set_wbsm_hint(p->second.get_wbsm());
}
@@ -138,7 +144,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
}
void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
{
- DefaultKernelData data{ tensors };
+ DefaultKernelData data{tensors};
do_tune_kernel_dynamic(kernel, &data);
}
@@ -154,7 +160,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
cl::CommandQueue queue_profiler;
// Extract real OpenCL function to intercept
- if(real_clEnqueueNDRangeKernel == nullptr)
+ if (real_clEnqueueNDRangeKernel == nullptr)
{
real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
}
@@ -165,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
// Check if we can use the OpenCL timer with the default queue
cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
- if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ if ((props & CL_QUEUE_PROFILING_ENABLE) == 0)
{
// Set the queue for profiling
queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
@@ -176,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
}
// Start intercepting enqueues:
- auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list, cl_event * event)
+ auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo,
+ const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event)
{
- if(this->kernel_event_is_set())
+ if (this->kernel_event_is_set())
{
// If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
return CL_SUCCESS;
}
cl_event tmp;
- cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+ cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws,
+ num_events_in_wait_list, event_wait_list, &tmp);
// Set OpenCL event
this->set_cl_kernel_event(tmp);
- if(event != nullptr)
+ if (event != nullptr)
{
//return cl_event from the intercepted call
clRetainEvent(tmp);
@@ -209,9 +217,10 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
/// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op()
/// Please see COMPMID-5934
cl::NDRange gws = kernel.get_cached_gws();
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
- "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search",
- kernel.config_id().c_str(), to_string(gws).c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(
+ arm_compute::logging::LogLevel::INFO,
+ "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(),
+ to_string(gws).c_str());
queue_profiler.finish();
@@ -224,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
// Construct the list of tuning parameters values to be tested based on the tuner mode.
auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
- for(size_t i = 0; i < tuning_list->size(); ++i)
+ for (size_t i = 0; i < tuning_list->size(); ++i)
{
CLTuningParams tuning_test = (*tuning_list)[i];
// Setting the lws
@@ -234,19 +243,18 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
auto z = lws_test[2];
const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
- if(invalid_lws)
+ if (invalid_lws)
{
continue;
}
kernel.set_lws_hint(lws_test);
- if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+ if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
{
cl_int wbsm_test = tuning_test.get_wbsm();
kernel.set_wbsm_hint(wbsm_test);
}
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
- "[CLTuner] Trying LWS: %s, WBSM: %d",
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d",
to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint());
// Run the kernel
@@ -260,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
_kernel_event = nullptr;
// Check the execution time
- if(diff < min_exec_time)
+ if (diff < min_exec_time)
{
min_exec_time = diff;
opt_tuning_params.set_lws(tuning_test.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
}
@@ -292,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename)
std::ifstream fs;
fs.exceptions(std::ifstream::badbit);
fs.open(filename, std::ios::in);
- if(!fs.is_open())
+ if (!fs.is_open())
{
ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
}
std::string line;
bool header_line = true;
- while(!std::getline(fs, line).fail())
+ while (!std::getline(fs, line).fail())
{
- if(header_line)
+ if (header_line)
{
header_line = false;
size_t pos_lws = line.find("lws");
size_t pos_wbsm = line.find("wbsm");
_tuning_info.tune_wbsm = false;
- if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
+ if (pos_lws != std::string::npos || pos_wbsm != std::string::npos)
{
// The file has in the first line the parameters it has been tuned on
- if(pos_wbsm != std::string::npos)
+ if (pos_wbsm != std::string::npos)
{
_tuning_info.tune_wbsm = true;
}
// Once the line with the tuning parameter is read we can
// read the next one to start collecting the values
- if(std::getline(fs, line).fail())
+ if (std::getline(fs, line).fail())
{
break;
}
@@ -324,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename)
CLTuningParams tuning_params;
size_t pos = line.find(";");
- if(pos == std::string::npos)
+ if (pos == std::string::npos)
{
ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
std::string kernel_id = line.substr(0, pos);
line.erase(0, pos + 1);
- if(!tuning_params.from_string(_tuning_info, line))
+ if (!tuning_params.from_string(_tuning_info, line))
{
ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
@@ -341,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename)
bool CLTuner::save_to_file(const std::string &filename) const
{
- if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
+ if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
{
return false;
}
@@ -350,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const
fs.open(filename, std::ios::out);
std::string header_string = "";
header_string += "lws";
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
- if(!header_string.empty())
+ if (!header_string.empty())
{
header_string += " ";
}
header_string += "wbsm";
}
fs << header_string << std::endl;
- for(auto const &kernel_data : _tuning_params_table)
+ for (auto const &kernel_data : _tuning_params_table)
{
CLTuningParams tun_pams(kernel_data.second);
fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index 4530537789..bc782c3a2c 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -26,15 +26,14 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
using namespace arm_compute;
ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT
- : _kernel(),
- _border_handler(std::make_unique<CLFillBorderKernel>()),
- _ctx(ctx)
+ : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx)
{
}
diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp
index da3d4850bf..294396c28a 100644
--- a/src/runtime/CL/Utils.cpp
+++ b/src/runtime/CL/Utils.cpp
@@ -35,20 +35,20 @@ namespace arm_compute
void restore_program_cache_from_file(const std::string &filename)
{
std::ifstream cache_file(filename, std::ios::binary);
- if(cache_file.is_open())
+ if (cache_file.is_open())
{
- if(!CLScheduler::get().is_initialised())
+ if (!CLScheduler::get().is_initialised())
{
arm_compute::CLScheduler::get().default_init();
}
- while(!cache_file.eof())
+ while (!cache_file.eof())
{
size_t name_len = 0;
size_t binary_len = 0;
cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t));
cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t));
- if(name_len == 0 || binary_len == 0)
+ if (name_len == 0 || binary_len == 0)
{
break;
}
@@ -60,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename)
tmp.resize(binary_len);
cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len);
cl::Context context = arm_compute::CLScheduler::get().context();
- cl::Program::Binaries binaries{ binary };
+ cl::Program::Binaries binaries{binary};
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program program(context, devices, binaries);
program.build();
@@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename)
void save_program_cache_to_file(const std::string &filename)
{
- if(CLScheduler::get().is_initialised())
+ if (CLScheduler::get().is_initialised())
{
std::ofstream cache_file(filename, std::ios::binary);
- if(cache_file.is_open())
+ if (cache_file.is_open())
{
- for(const auto &it : CLKernelLibrary::get().get_built_programs())
+ for (const auto &it : CLKernelLibrary::get().get_built_programs())
{
std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>();
ARM_COMPUTE_ERROR_ON(binaries.size() != 1);
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index f324b1a68c..c035644e4a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClActivation.h"
@@ -35,18 +36,17 @@ namespace arm_compute
{
struct CLActivationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- CLRuntimeContext *ctx{ nullptr };
- std::unique_ptr<opencl::ClActivation> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ CLRuntimeContext *ctx{nullptr};
+ std::unique_ptr<opencl::ClActivation> op{nullptr};
};
-CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default;
CLActivationLayer::~CLActivationLayer() = default;
@@ -55,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio
configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
}
-void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -66,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info);
}
-Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
return opencl::ClActivation::validate(input, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index b30d739025..f9bbd31e8a 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -27,31 +27,39 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/runtime/Utils.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis()
+ : _memory_group(std::move(memory_manager)),
+ _not_reshaped_output(),
+ _arg_min_max_kernel(),
+ _reshape(),
+ _reduction_axis()
{
}
CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
-Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid reduction operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
DataType output_data_type = DataType::S32;
@@ -59,17 +67,18 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen
const auto input_num_channles = input->num_channels();
const auto input_qinfo = input->quantization_info();
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
output_data_type = output->data_type();
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
}
auto shape_before_reshape = input->tensor_shape();
shape_before_reshape.set(axis, 1);
- auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
- {
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+ QuantizationInfo qinfo) {
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
};
@@ -85,20 +94,36 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
}
-void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int axis,
+ ICLTensor *output,
+ const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
_reduction_axis = axis;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
-
- TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() };
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ DataType output_data_type =
+ (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ TensorShape not_reshaped_output_shape{input->info()->tensor_shape()};
not_reshaped_output_shape.set(axis, 1);
- auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+ ->clone()
+ ->set_tensor_shape(not_reshaped_output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
_arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>();
_arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op);
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index e8affc0853..0c371c4171 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -30,9 +30,8 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
namespace arm_compute
{
@@ -43,24 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer()
CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
-void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
+void CLBatchNormalizationLayer::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma, float epsilon,
- ActivationLayerInfo act_info)
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
_norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
}
-Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info);
}
@@ -69,4 +84,4 @@ void CLBatchNormalizationLayer::run()
{
CLScheduler::get().enqueue(*_norm_kernel, true);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index d7a409128d..a3798daf61 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -30,14 +30,12 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
namespace arm_compute
{
-CLBatchToSpaceLayer::CLBatchToSpaceLayer()
- : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
+CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
{
}
@@ -49,29 +47,43 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo
_batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
_batch_to_space_kernel->configure(compile_context, input, block_shape, output);
}
-void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(
+ const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output);
_batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info);
}
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
}
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
}
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index a4712ed3f1..7bfd0e3677 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,11 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::AND);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 5964b92447..9763915c02 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -43,4 +42,4 @@ void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLT
k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index a07bf17bb2..dd3171b982 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,11 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseOr::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::OR);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index f65e2e406c..5bee4b37ec 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,7 +35,10 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseXor::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 48583bfaf3..76e626fd75 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -23,18 +23,24 @@
*/
#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
-#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
namespace arm_compute
{
-void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
@@ -44,7 +50,10 @@ void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context,
_kernel = std::move(k);
}
-Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 10f7cc2065..42ec8f7ee0 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
@@ -37,16 +37,15 @@ namespace arm_compute
{
struct CLCast::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCast> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCast> op{nullptr};
};
-CLCast::CLCast()
- : _impl(std::make_unique<Impl>())
+CLCast::CLCast() : _impl(std::make_unique<Impl>())
{
}
-CLCast::CLCast(CLCast &&) = default;
+CLCast::CLCast(CLCast &&) = default;
CLCast &CLCast::operator=(CLCast &&) = default;
CLCast::~CLCast() = default;
@@ -55,7 +54,10 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy
configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
}
-void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+void CLCast::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, policy);
@@ -74,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con
void CLCast::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index 021f28f238..1ee4789816 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
namespace arm_compute
{
@@ -35,7 +35,10 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output,
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
auto k = std::make_unique<CLChannelShuffleLayerKernel>();
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 192a266f0f..2f54371e88 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLComparisonKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
@@ -37,25 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_LOG_PARAMS(input2, input2, output, operation);
auto k = std::make_unique<CLComparisonKernel>();
k->configure(compile_context, input1, input2, output, operation);
_kernel = std::move(k);
- if(output->info()->dimension(0) > 1)
+ if (output->info()->dimension(0) > 1)
{
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
- if(broadcasted_info->info()->dimension(0) == 1)
+ if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+ BorderMode::REPLICATE);
}
}
}
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
return CLComparisonKernel::validate(input1, input2, output, operation);
}
@@ -67,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC
}
template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
auto k = std::make_unique<CLComparisonKernel>();
k->configure(compile_context, input1, input2, output, COP);
_kernel = std::move(k);
- if(output->info()->dimension(0) > 1)
+ if (output->info()->dimension(0) > 1)
{
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
- if(broadcasted_info->info()->dimension(0) == 1)
+ if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+ BorderMode::REPLICATE);
}
}
}
template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
return CLComparisonKernel::validate(input1, input2, output, COP);
}
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 0a8884f4e3..9df1c34593 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -24,24 +24,23 @@
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConcatenate.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
namespace arm_compute
{
struct CLConcatenateLayer::Impl
{
std::vector<const ICLTensor *> srcs{};
- ICLTensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<opencl::ClConcatenate> op{ nullptr };
+ ICLTensor *dst{nullptr};
+ unsigned int num_inputs{0};
+ unsigned int axis{0};
+ std::unique_ptr<opencl::ClConcatenate> op{nullptr};
};
-CLConcatenateLayer::CLConcatenateLayer()
- : _impl(std::make_unique<Impl>())
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>())
{
}
@@ -56,7 +55,10 @@ void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector
configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
}
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context,
+ std::vector<const ICLTensor *> &inputs_vector,
+ ICLTensor *output,
+ size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis);
@@ -68,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->op = std::make_unique<opencl::ClConcatenate>();
std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+ for (unsigned int i = 0; i < inputs_vector.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -76,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis);
}
-Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+ const ITensorInfo *output,
+ size_t axis)
{
return opencl::ClConcatenate::validate(inputs_vector, output, axis);
}
@@ -84,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
void CLConcatenateLayer::run()
{
ITensorPack pack;
- for(unsigned i = 0; i < _impl->num_inputs; ++i)
+ for (unsigned i = 0; i < _impl->num_inputs; ++i)
{
pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
}
diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp
index 729b973b6a..9d1b368f72 100644
--- a/src/runtime/CL/functions/CLConv3D.cpp
+++ b/src/runtime/CL/functions/CLConv3D.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLConv3D.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/gpu/cl/operators/ClDirectConv3d.h"
namespace arm_compute
@@ -32,29 +33,38 @@ using namespace arm_compute::experimental;
struct CLConv3D::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDirectConv3d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDirectConv3d> op{nullptr};
};
-CLConv3D::CLConv3D()
- : _impl(std::make_unique<Impl>())
+CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>())
{
}
CLConv3D::~CLConv3D() = default;
-void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info)
{
configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info);
}
-void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(
+ src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
_impl->src = src;
_impl->weights = weights;
@@ -62,10 +72,15 @@ void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTenso
_impl->dst = dst;
_impl->op = std::make_unique<opencl::ClDirectConv3d>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(),
+ _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
}
-Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status CLConv3D::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info);
}
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index b3efe5c8a0..2298f2a669 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -27,33 +27,37 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
namespace arm_compute
{
struct CLConvertFullyConnectedWeights::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr};
};
-CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights()
- : _impl(std::make_unique<Impl>())
+CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
{
}
CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default;
-void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
}
-void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout);
@@ -63,8 +67,10 @@ void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_c
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
}
-Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
}
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f3c05adb47..7767b45a01 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -28,11 +28,11 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClConv2d.h"
-
-#include "src/common/utils/Log.h"
#include "support/Cast.h"
namespace arm_compute
@@ -43,41 +43,59 @@ struct CLConvolutionLayer::Impl
{
MemoryGroup memory_group{};
std::shared_ptr<IMemoryManager> memory_manager{};
- std::unique_ptr<opencl::IClOperator> op{ nullptr };
+ std::unique_ptr<opencl::IClOperator> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
WorkspaceData<CLTensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- std::unique_ptr<IFunction> func{ nullptr };
+ std::unique_ptr<IFunction> func{nullptr};
};
-CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_manager = std::move(memory_manager);
}
CLConvolutionLayer::~CLConvolutionLayer() = default;
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, enable_fast_math, num_groups);
}
-void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
- weights_info, CLScheduler::get().target()))
+ switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
+ weights_info, CLScheduler::get().target()))
{
case ConvolutionMethod::WINOGRAD:
case ConvolutionMethod::DIRECT:
@@ -85,7 +103,8 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
case ConvolutionMethod::GEMM:
{
auto f = std::make_unique<opencl::ClConv2d>();
- f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+ f->configure(compile_context, input->info(), weights->info(),
+ ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
_impl->op = std::move(f);
break;
}
@@ -101,40 +120,52 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
break;
}
- if(_impl->op)
+ if (_impl->op)
{
_impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
const GPUTarget gpu_target = CLScheduler::get().target();
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
+ switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
{
case ConvolutionMethod::WINOGRAD:
case ConvolutionMethod::DIRECT:
case ConvolutionMethod::INDIRECT:
case ConvolutionMethod::GEMM:
{
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
break;
}
case ConvolutionMethod::FFT:
{
// Validate FFT-based convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+ act_info, enable_fast_math));
break;
}
default:
@@ -145,8 +176,15 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
return Status{};
}
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info,
+ const GPUTarget gpu_target,
+ const Size2D &dilation,
+ bool enable_fast_math)
{
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1);
return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target);
@@ -158,7 +196,7 @@ void CLConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_impl->memory_group);
- if(_impl->func)
+ if (_impl->func)
{
_impl->func->run();
}
@@ -170,7 +208,7 @@ void CLConvolutionLayer::run()
void CLConvolutionLayer::prepare()
{
- if(_impl->func)
+ if (_impl->func)
{
_impl->func->prepare();
}
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 56400b67a0..a4f2b0634f 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -27,10 +27,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCopy.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCopy.h"
#include <utility>
@@ -38,16 +38,15 @@ namespace arm_compute
{
struct CLCopy::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCopy> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCopy> op{nullptr};
};
-CLCopy::CLCopy()
- : _impl(std::make_unique<Impl>())
+CLCopy::CLCopy() : _impl(std::make_unique<Impl>())
{
}
-CLCopy::CLCopy(CLCopy &&) = default;
+CLCopy::CLCopy(CLCopy &&) = default;
CLCopy &CLCopy::operator=(CLCopy &&) = default;
CLCopy::~CLCopy() = default;
diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp
index 35ea17cfc2..fc29c43827 100644
--- a/src/runtime/CL/functions/CLCrop.cpp
+++ b/src/runtime/CL/functions/CLCrop.cpp
@@ -27,10 +27,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCrop.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCrop.h"
#include <utility>
@@ -38,27 +38,38 @@ namespace arm_compute
{
struct CLCrop::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCrop> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCrop> op{nullptr};
};
-CLCrop::CLCrop()
- : _impl(std::make_unique<Impl>())
+CLCrop::CLCrop() : _impl(std::make_unique<Impl>())
{
}
-CLCrop::CLCrop(CLCrop &&) = default;
+CLCrop::CLCrop(CLCrop &&) = default;
CLCrop &CLCrop::operator=(CLCrop &&) = default;
CLCrop::~CLCrop() = default;
-void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void CLCrop::configure(const ICLTensor *src,
+ ICLTensor *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
- configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
+ configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value,
+ dst_window);
}
-void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void CLCrop::configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ ICLTensor *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
@@ -67,10 +78,17 @@ void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor
_impl->dst = dst;
_impl->op = std::make_unique<opencl::ClCrop>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index,
+ extrapolation_value, dst_window);
}
-Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status CLCrop::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window);
}
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index d8fc38d99e..821412b149 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,19 +25,26 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/common/utils/Log.h"
-
#include <cstddef>
namespace arm_compute
{
namespace
{
-inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+inline void configure_crop(const ICLTensor *input,
+ ICLTensor *crop_boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ uint32_t crop_box_ind,
+ Coordinates &start,
+ Coordinates &end,
+ uint32_t &batch_index)
{
batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
@@ -50,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen
// The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
+ end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1,
+ static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
output->info()->set_tensor_shape(out_shape);
}
} // namespace
CLCropResize::CLCropResize()
- : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions()
+ : _input(nullptr),
+ _boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _num_boxes(0),
+ _method(),
+ _extrapolation_value(0),
+ _scale(),
+ _copy(),
+ _crop_results(),
+ _scaled_results(),
+ _internal_functions()
{
}
CLCropResize::~CLCropResize() = default;
-Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status CLCropResize::validate(const ITensorInfo *input,
+ ITensorInfo *boxes,
+ ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
TensorInfo temp_info;
- ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
- if(output->total_size() > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1},
+ input->dimension(3) - 1, extrapolation_value));
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,20 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen
return Status{};
}
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+ configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+ extrapolation_value);
}
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
- ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+ crop_size, method, extrapolation_value));
ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
- TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+ TensorShape output_shape =
+ TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
_num_boxes = boxes->info()->tensor_shape()[1];
@@ -122,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
// kernels used for cropping and scaling.
_boxes->map(CLScheduler::get().queue());
_box_ind->map(CLScheduler::get().queue());
- for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
+ for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
{
auto crop_tensor = std::make_unique<CLTensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -143,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
auto scale_kernel = std::make_unique<CLScale>();
- scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT });
+ scale_kernel->configure(
+ compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(),
+ ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT});
_scale.emplace_back(std::move(scale_kernel));
Window win = calculate_max_window(*_output->info());
@@ -159,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
bool is_width_flipped = end[0] < start[0];
bool is_height_flipped = end[1] < start[1];
/** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */
- std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+ std::array<int32_t, 2> rows_out_of_bounds{0};
/** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */
- std::array<int32_t, 2> cols_out_of_bounds{ 0 };
- if(is_height_flipped)
+ std::array<int32_t, 2> cols_out_of_bounds{0};
+ if (is_height_flipped)
{
- rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
- rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+ rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(start[1] - _input->info()->dimension(2) + 1,
+ _crop_results[num_box].get()->info()->dimension(2))
+ : 0;
+ rows_out_of_bounds[1] =
+ end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+ : 0;
}
else
{
- rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
- rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+ rows_out_of_bounds[0] =
+ start[1] < 0
+ ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+ : 0;
+ rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(end[1] - _input->info()->dimension(2) + 1,
+ _crop_results[num_box].get()->info()->dimension(2))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
- cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+ cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(start[0] - _input->info()->dimension(1) + 1,
+ _crop_results[num_box].get()->info()->dimension(1))
+ : 0;
+ cols_out_of_bounds[1] =
+ end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+ : 0;
}
else
{
- cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
- cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+ cols_out_of_bounds[0] =
+ start[0] < 0
+ ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+ : 0;
+ cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(end[0] - _input->info()->dimension(1) + 1,
+ _crop_results[num_box].get()->info()->dimension(1))
+ : 0;
}
Window full_window = calculate_max_window(*_crop_results[num_box].get()->info());
@@ -203,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
// Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds
// with the extrapolation value using memset.
// First for the rows before the in bounds rows.
- if(rows_out_of_bounds[0] > 0)
+ if (rows_out_of_bounds[0] > 0)
{
Window slice_fill_rows_before(full_window);
slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_rows_before);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
Window slice_in(full_window);
- slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
- slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
-
- int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
- if(rows_in_bounds > 0)
+ slice_in.set(2,
+ Window::Dimension(rows_out_of_bounds[0],
+ _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
+ slice_in.set(1,
+ Window::Dimension(cols_out_of_bounds[0],
+ _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+ int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) -
+ rows_out_of_bounds[0] - rows_out_of_bounds[1];
+ if (rows_in_bounds > 0)
{
// Fill all elements that share a row with an in bounds element with the extrapolation value.
- if(cols_out_of_bounds[0] > 0)
+ if (cols_out_of_bounds[0] > 0)
{
Window slice_fill_cols_before(slice_in);
slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_cols_before);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
- if(cols_out_of_bounds[1] > 0)
+ if (cols_out_of_bounds[1] > 0)
{
Window slice_fill_cols_after(slice_in);
- slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1));
+ slice_fill_cols_after.set(
+ 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1],
+ _crop_results[num_box].get()->info()->dimension(1), 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_cols_after);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
// Copy all elements within the input bounds from the input tensor.
- int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
- if(cols_in_bounds > 0)
+ int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) -
+ cols_out_of_bounds[0] - cols_out_of_bounds[1];
+ if (cols_in_bounds > 0)
{
- Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
- is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
- Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
- is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+ Coordinates2D start_in{
+ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+ is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]};
+ Coordinates2D end_in{
+ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+ is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1};
auto kernel = std::make_unique<CLCrop>();
- kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in);
+ kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index,
+ extrapolation_value, &slice_in);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
}
// Fill all rows after the in bounds elements with the extrapolation value.
- if(rows_out_of_bounds[1] > 0)
+ if (rows_out_of_bounds[1] > 0)
{
Window slice_fill_rows_after(full_window);
- slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1));
+ slice_fill_rows_after.set(
+ 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1],
+ _crop_results[num_box].get()->info()->dimension(2), 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_rows_after);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
@@ -277,18 +357,18 @@ void CLCropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- for(unsigned int i = 0; i < _internal_functions.size(); ++i)
+ for (unsigned int i = 0; i < _internal_functions.size(); ++i)
{
_internal_functions[i]->run();
}
CLScheduler::get().sync();
- for(auto &kernel : _scale)
+ for (auto &kernel : _scale)
{
kernel->run();
}
CLScheduler::get().sync();
- for(auto &kernel : _copy)
+ for (auto &kernel : _copy)
{
kernel->run();
}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 4421a18f2a..e988ab0ac4 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -25,16 +25,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/IClOperator.h"
#include "src/gpu/cl/operators/ClTransposedConvolution.h"
-#include "src/common/utils/Log.h"
-
#include <cmath>
#include <memory>
#include <tuple>
@@ -44,11 +44,11 @@ using namespace arm_compute::misc::shape_calculator;
struct CLDeconvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::IClOperator> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::IClOperator> op{nullptr};
};
CLDeconvolutionLayer::~CLDeconvolutionLayer() = default;
@@ -58,24 +58,35 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
{
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
}
-void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info);
- switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+ switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
+ deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
auto op = std::make_unique<opencl::ClTransposedConvolution>();
- op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info);
+ op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr,
+ output->info(), deconv_info);
_impl->src = input;
_impl->weights = weights;
@@ -105,22 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC
}
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+ switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
// Validate transposed convolution operator
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
break;
}
case DeconvolutionMethod::UPSCALE_CONV2D:
{
// Validate direct convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
break;
}
case DeconvolutionMethod::GEMM:
@@ -137,12 +154,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(output, bias, weights_info);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
return DeconvolutionMethod::UPSCALE_CONV2D;
}
@@ -154,11 +175,12 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor
const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
const size_t ofm = weights->tensor_shape()[idx_n];
- if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
+ if (weights->dimension(idx_w) != deconv_info.stride().first ||
+ weights->dimension(idx_h) != deconv_info.stride().second)
{
// We observe better performance for FP32 types only when ofm <= 16.
// A better heuristic is required for selecting the method for FP16 data types.
- if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
+ if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
{
return DeconvolutionMethod::DIRECT;
}
@@ -175,7 +197,7 @@ void CLDeconvolutionLayer::run()
{
prepare();
- if(_impl->op != nullptr)
+ if (_impl->op != nullptr)
{
// Optimized Operator will be used
ITensorPack pack;
@@ -195,7 +217,7 @@ void CLDeconvolutionLayer::run()
void CLDeconvolutionLayer::prepare()
{
- if(_impl->op == nullptr)
+ if (_impl->op == nullptr)
{
_function->prepare();
}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 0b428f5b17..b92bf903a6 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -27,22 +27,21 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
namespace arm_compute
{
CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
- : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()),
- _fill(),
- _output(nullptr)
+ : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr)
{
}
CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default;
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
+Status
+CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
{
return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
}
@@ -52,13 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, info);
_output = output;
- _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
+ _fill.configure(compile_context, _output,
+ PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
_upsample->configure(compile_context, input, _output, info);
}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index cac3f51013..6d2fea974e 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
@@ -37,16 +37,15 @@ namespace arm_compute
{
struct CLDepthConvertLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCast> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCast> op{nullptr};
};
-CLDepthConvertLayer::CLDepthConvertLayer()
- : _impl(std::make_unique<Impl>())
+CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>())
{
}
-CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default;
CLDepthConvertLayer::~CLDepthConvertLayer() = default;
@@ -55,7 +54,11 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C
configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
}
-void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ uint32_t shift)
{
ARM_COMPUTE_UNUSED(shift);
ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift);
@@ -70,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
}
-Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
return opencl::ClCast::validate(input, output, policy);
@@ -78,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo
void CLDepthConvertLayer::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index 98531e7cac..9477c7f81d 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
-#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
#include <utility>
@@ -36,7 +35,10 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
auto k = std::make_unique<CLDepthToSpaceLayerKernel>();
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index dcb982fa56..873601bb11 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -29,12 +29,12 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
using namespace arm_compute::misc;
@@ -63,25 +63,33 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemory
CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+ act_info, dilation);
}
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
- ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output != nullptr ? output->info() : input->info(),
- conv_info,
- depth_multiplier,
- act_info,
- dilation));
+ ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+ output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
_is_quantized = is_data_type_quantized(input->info()->data_type());
@@ -96,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
_memory_group.manage(&_permuted_output);
@@ -119,10 +127,12 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
CLTensor *output_multipliers_to_use = nullptr;
CLTensor *output_shifts_to_use = nullptr;
- if(_is_quantized)
+ if (_is_quantized)
{
- const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+ const size_t idx_c =
+ get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t num_filters =
+ (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
_output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
_output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -132,16 +142,18 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
}
// Get the depthwise convolution compute parameters
- auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
- const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
_dwc_native_kernel->set_target(gpu_target);
_dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
- dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use);
+ dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+ output_shifts_to_use);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
@@ -151,22 +163,27 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
_permuted_output.allocator()->allocate();
}
- if(_is_quantized)
+ if (_is_quantized)
{
_output_multipliers.allocator()->allocate();
_output_shifts.allocator()->allocate();
}
}
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
const bool in_place = input == output || output == nullptr;
- if(in_place)
+ if (in_place)
{
output = input;
}
@@ -174,21 +191,23 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+ input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+ input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
const GPUTarget gpu_target = CLScheduler::get().target();
- const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
const bool needs_permute = input->data_layout() == DataLayout::NCHW;
const bool is_quantized = is_data_type_quantized(input->data_type());
TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
- if(is_quantized)
+ if (is_quantized)
{
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
@@ -201,40 +220,57 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
}
}
- if(needs_permute)
+ if (needs_permute)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+ TensorShape permuted_output_shape =
+ shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
- const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
- const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_input = input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_weights = weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_output = output->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NHWC);
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
// Get the depthwise convolution compute parameters
- auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
- dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+ &output_multipliers_shifts_info, &output_multipliers_shifts_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
}
else
{
// Get the depthwise convolution compute parameters
- auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
- &output_multipliers_shifts_info));
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(input, weights, conv_info, dilation, depth_multiplier);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+ input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+ &output_multipliers_shifts_info));
}
return Status{};
}
@@ -245,12 +281,12 @@ void CLDepthwiseConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_to_nhwc.run();
}
CLScheduler::get().enqueue(*_dwc_native_kernel);
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_to_nchw.run();
}
@@ -258,22 +294,21 @@ void CLDepthwiseConvolutionLayer::run()
void CLDepthwiseConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_is_quantized)
+ if (_is_quantized)
{
_output_multipliers.map();
_output_shifts.map();
- quantization::compute_quantized_multipliers_and_shifts(_input->info(),
- _original_weights->info(),
- _output != nullptr ? _output->info() : _input->info(),
- reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
- reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+ quantization::compute_quantized_multipliers_and_shifts(
+ _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+ reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+ reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
_output_multipliers.unmap();
_output_shifts.unmap();
}
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 64c6b5d91c..20162a03db 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -26,22 +26,21 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClDequantize.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
namespace arm_compute
{
struct CLDequantizationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDequantize> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDequantize> op{nullptr};
};
-CLDequantizationLayer::CLDequantizationLayer()
- : _impl(std::make_unique<Impl>())
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
{
}
CLDequantizationLayer::~CLDequantizationLayer() = default;
@@ -51,7 +50,9 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
configure(CLKernelLibrary::get().get_compile_context(), input, output);
}
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->src = input;
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 752e0e4a60..d6dae0d732 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -28,37 +28,46 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/operators/ClActivation.h"
-#include "src/gpu/cl/operators/ClDirectConv2d.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
namespace arm_compute
{
struct CLDirectConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
};
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
- : _impl(std::make_unique<Impl>())
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
{
}
-CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
}
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -69,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClDirectConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
}
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -87,4 +101,4 @@ void CLDirectConvolutionLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 88c3c6193c..3717f30ae1 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -26,15 +26,15 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
-
#include <memory>
#include <tuple>
@@ -55,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa
{
}
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -70,20 +75,22 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+ weights->dimension(idx_w), weights->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- if(input->data_type() != weights->data_type())
+ if (input->data_type() != weights->data_type())
{
- ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+ !is_data_type_quantized_asymmetric(input->data_type()));
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -102,24 +109,39 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
unsigned int deconv_pad_y = 0;
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
return Status{};
}
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
}
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
@@ -141,15 +163,19 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+ auto out_dims =
+ deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
_is_prepared = weights_info.retain_internal_weights();
@@ -158,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
unsigned int deconv_pad_x = 0;
unsigned int deconv_pad_y = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+ *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0;
unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -179,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_scaled_output.allocator()->init(scale_out_info);
// configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+ const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+ deconv_pad_bottom, DimensionRoundingType::FLOOR);
_scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
// Setup the function to convolve the upscaled output
@@ -191,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_flip_axis.allocator()->allocate();
_flip_axis.map(true);
auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
- if(weights->info()->data_layout() == DataLayout::NHWC)
+ if (weights->info()->data_layout() == DataLayout::NHWC)
{
axis_data[0] = 1;
axis_data[1] = 2;
@@ -216,7 +244,7 @@ void CLDirectDeconvolutionLayer::run()
void CLDirectDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
@@ -229,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare()
_conv_f.prepare();
// Free flipped weights
- if(!_weights_flipped.is_used())
+ if (!_weights_flipped.is_used())
{
_weights_flipped.allocator()->free();
}
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 936b37fb31..d9529f0b7f 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -26,8 +26,8 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClAdd.h"
#include "src/gpu/cl/operators/ClElementwiseOperations.h"
#include "src/gpu/cl/operators/ClSub.h"
@@ -36,26 +36,30 @@ namespace arm_compute
{
struct CLArithmeticAddition::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClAdd> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClAdd> op{nullptr};
};
-CLArithmeticAddition::CLArithmeticAddition()
- : _impl(std::make_unique<Impl>())
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
CLArithmeticAddition::~CLArithmeticAddition() = default;
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+ ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
}
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
@@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
}
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticAddition::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
}
@@ -82,26 +90,33 @@ void CLArithmeticAddition::run()
struct CLArithmeticSubtraction::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClSub> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClSub> op{nullptr};
};
-CLArithmeticSubtraction::CLArithmeticSubtraction()
- : _impl(std::make_unique<Impl>())
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
CLArithmeticSubtraction::~CLArithmeticSubtraction() = default;
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
}
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
@@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
}
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClSub::validate(input1, input2, output, policy, act_info);
}
@@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run()
struct CLArithmeticDivision::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
};
-CLArithmeticDivision::CLArithmeticDivision()
- : _impl(std::make_unique<Impl>())
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
CLArithmeticDivision::~CLArithmeticDivision() = default;
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
}
@@ -173,26 +201,32 @@ void CLArithmeticDivision::run()
struct CLElementwiseMax::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
};
-CLElementwiseMax::CLElementwiseMax()
- : _impl(std::make_unique<Impl>())
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
CLElementwiseMax::~CLElementwiseMax() = default;
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
}
@@ -218,26 +255,32 @@ void CLElementwiseMax::run()
struct CLElementwiseMin::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseMin> op{nullptr};
};
-CLElementwiseMin::CLElementwiseMin()
- : _impl(std::make_unique<Impl>())
+CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
+CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default;
CLElementwiseMin::~CLElementwiseMin() = default;
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMin::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseMin::validate(input1, input2, output, act_info);
}
@@ -263,26 +309,32 @@ void CLElementwiseMin::run()
struct CLElementwiseSquaredDiff::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr};
};
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
- : _impl(std::make_unique<Impl>())
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default;
CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default;
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info);
}
@@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run()
struct CLElementwisePower::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwisePower> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwisePower> op{nullptr};
};
-CLElementwisePower::CLElementwisePower()
- : _impl(std::make_unique<Impl>())
+CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>())
{
}
-CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
+CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default;
CLElementwisePower::~CLElementwisePower() = default;
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwisePower::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwisePower::validate(input1, input2, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index 9dcd2d1891..3043c26feb 100644
--- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClElementwiseUnary.h"
@@ -32,17 +33,16 @@ namespace arm_compute
{
struct CLRsqrtLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClRsqrt> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClRsqrt> op{nullptr};
};
-CLRsqrtLayer::CLRsqrtLayer()
- : _impl(std::make_unique<Impl>())
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
{
}
-CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
CLRsqrtLayer::~CLRsqrtLayer() = default;
@@ -74,17 +74,16 @@ void CLRsqrtLayer::run()
struct CLExpLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClExp> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClExp> op{nullptr};
};
-CLExpLayer::CLExpLayer()
- : _impl(std::make_unique<Impl>())
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
{
}
-CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
CLExpLayer::~CLExpLayer() = default;
@@ -116,17 +115,16 @@ void CLExpLayer::run()
struct CLNegLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClNeg> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClNeg> op{nullptr};
};
-CLNegLayer::CLNegLayer()
- : _impl(std::make_unique<Impl>())
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
{
}
-CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
CLNegLayer::~CLNegLayer() = default;
@@ -157,17 +155,16 @@ void CLNegLayer::run()
struct CLSinLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClSin> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClSin> op{nullptr};
};
-CLSinLayer::CLSinLayer()
- : _impl(std::make_unique<Impl>())
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
{
}
-CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
CLSinLayer::~CLSinLayer() = default;
@@ -198,17 +195,16 @@ void CLSinLayer::run()
struct CLAbsLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClAbs> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClAbs> op{nullptr};
};
-CLAbsLayer::CLAbsLayer()
- : _impl(std::make_unique<Impl>())
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
{
}
-CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
CLAbsLayer::~CLAbsLayer() = default;
@@ -239,17 +235,16 @@ void CLAbsLayer::run()
struct CLLogLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClLog> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClLog> op{nullptr};
};
-CLLogLayer::CLLogLayer()
- : _impl(std::make_unique<Impl>())
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
{
}
-CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
CLLogLayer::~CLLogLayer() = default;
@@ -280,17 +275,16 @@ void CLLogLayer::run()
struct CLRoundLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClRound> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClRound> op{nullptr};
};
-CLRoundLayer::CLRoundLayer()
- : _impl(std::make_unique<Impl>())
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
{
}
-CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
CLRoundLayer::~CLRoundLayer() = default;
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index bd0966b65f..48e9ae824a 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
#include "src/core/utils/helpers/fft.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,7 +54,10 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT1DInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
@@ -77,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
_digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
_memory_group.manage(&_digit_reversed_input);
- _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+ _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+ digit_reverse_config);
// Create and configure FFT kernels
unsigned int Nx = 1;
_num_ffts = decomposed_vector.size();
_fft_kernels.reserve(_num_ffts);
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -93,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
_fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
- _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+ ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
Nx *= radix_for_stage;
}
// Configure scale kernel
- if(_run_scale)
+ if (_run_scale)
{
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
- is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+ : _scale_kernel->configure(output, nullptr, scale_config);
}
// Allocate tensors
@@ -123,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
// Check if FFT is decomposable
const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -132,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -151,13 +157,13 @@ void CLFFT1D::run()
CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
// Run radix kernels
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
}
// Run output scaling
- if(_run_scale)
+ if (_run_scale)
{
CLScheduler::get().enqueue(*_scale_kernel, true);
}
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 94fc411355..3857046719 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -26,16 +26,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+ : _memory_group(memory_manager),
+ _first_pass_func(memory_manager),
+ _second_pass_func(memory_manager),
+ _first_pass_tensor()
{
}
@@ -46,7 +49,10 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT2DInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
@@ -88,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index d12e2de3bf..3894b10785 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -25,10 +25,12 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -38,8 +40,6 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/utils/helpers/fft.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace
@@ -50,11 +50,11 @@ int pad_decomposable(int N)
int pad = 0;
bool is_decomposed = false;
- while(!is_decomposed)
+ while (!is_decomposed)
{
const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
is_decomposed = !decomposed_vector.empty();
- if(!is_decomposed)
+ if (!is_decomposed)
{
++pad;
}
@@ -104,17 +104,31 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
{
}
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+ enable_fast_math);
}
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
- ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(), conv_info, act_info, enable_fast_math));
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_original_weights = weights;
@@ -124,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_has_bias = biases != nullptr;
// Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
- pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ const Size2D input_dims =
+ Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size =
+ Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
// Tensors to use
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = _has_bias ? &_bias_output : output;
// Permute bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
_permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -146,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Permute input if needed
_needs_permute = input->info()->data_layout() == DataLayout::NHWC;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -167,7 +184,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
// Pad weights
- const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
_pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
// Transform weights
@@ -175,10 +192,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
- const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
_memory_group.manage(&_padded_input);
_pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
}
@@ -202,7 +219,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_memory_group.manage(&_itransformed_output);
FFT2DInfo itranform_info;
itranform_info.direction = FFTDirection::Inverse;
- _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransformed_output.allocator()->init(
+ _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
_itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
_output_reduced.allocator()->allocate();
@@ -214,25 +232,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Extract correct region
const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
- const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
- const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
- if(_has_bias)
+ const int end_right =
+ _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_botton =
+ _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if (_has_bias)
{
_memory_group.manage(&_bias_output);
}
- else if(_needs_permute)
+ else if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+ Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
_itransformed_output.allocator()->allocate();
// Add bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
@@ -243,7 +264,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
}
// Permute output
- if(_needs_permute)
+ if (_needs_permute)
{
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -255,7 +276,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.configure(compile_context, output, nullptr, act_info);
}
@@ -269,8 +290,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_flip_axis.unmap();
}
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
@@ -287,24 +313,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
const auto strides = conv_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+ conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+ conv_info.pad_bottom() != (kernel_size.y() / 2));
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+ (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
// Validate Activation Layer
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
}
@@ -320,7 +349,7 @@ void CLFFTConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Transform input
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_func.run();
}
@@ -336,17 +365,17 @@ void CLFFTConvolutionLayer::run()
_reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
_extract_output_func.run();
// Add bias
- if(_has_bias)
+ if (_has_bias)
{
_bias_add_func.run();
}
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_func.run();
}
// Run activation layer
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.run();
}
@@ -354,10 +383,10 @@ void CLFFTConvolutionLayer::run()
void CLFFTConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Permute bias to NCHW
- if(_original_bias != nullptr)
+ if (_original_bias != nullptr)
{
_permuted_bias.allocator()->allocate();
_permute_bias_func.run();
@@ -366,7 +395,7 @@ void CLFFTConvolutionLayer::prepare()
const ICLTensor *cur_weights = _original_weights;
// Permute weights
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 6019a84aba..9bd96a975e 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClFill.h"
@@ -36,16 +37,15 @@ namespace arm_compute
{
struct CLFill::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFill> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFill> op{nullptr};
};
-CLFill::CLFill()
- : _impl(std::make_unique<Impl>())
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
{
}
-CLFill::CLFill(CLFill &&) = default;
+CLFill::CLFill(CLFill &&) = default;
CLFill &CLFill::operator=(CLFill &&) = default;
CLFill::~CLFill() = default;
@@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind
configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
}
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+void CLFill::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ const PixelValue &constant_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 32fc37552c..ba1b5372d3 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -26,8 +26,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/gpu/cl/operators/ClFlatten.h"
@@ -36,16 +37,15 @@ namespace arm_compute
{
struct CLFlattenLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFlatten> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFlatten> op{nullptr};
};
-CLFlattenLayer::CLFlattenLayer()
- : _impl(std::make_unique<Impl>())
+CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>())
{
}
-CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
+CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default;
CLFlattenLayer::~CLFlattenLayer() = default;
@@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_impl->src = input;
_impl->dst = output;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_flatten_shape(input->info())));
_impl->op = std::make_unique<opencl::ClFlatten>();
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info());
@@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
}
return opencl::ClFlatten::validate(input, output);
@@ -83,4 +85,4 @@ void CLFlattenLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 8739e1803e..4322219dd9 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClFloor.h"
@@ -34,16 +35,15 @@ namespace arm_compute
{
struct CLFloor::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFloor> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFloor> op{nullptr};
};
-CLFloor::CLFloor()
- : _impl(std::make_unique<Impl>())
+CLFloor::CLFloor() : _impl(std::make_unique<Impl>())
{
}
-CLFloor::CLFloor(CLFloor &&) = default;
+CLFloor::CLFloor(CLFloor &&) = default;
CLFloor &CLFloor::operator=(CLFloor &&) = default;
CLFloor::~CLFloor() = default;
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 1c162db79a..b30f9e701f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClFullyConnected.h"
@@ -35,21 +36,22 @@ using namespace arm_compute::experimental;
struct CLFullyConnectedLayer::Impl
{
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
- std::unique_ptr<opencl::ClFullyConnected> op{ nullptr };
+ std::unique_ptr<opencl::ClFullyConnected> op{nullptr};
- const ITensor *original_weights{ nullptr };
+ const ITensor *original_weights{nullptr};
ITensorPack run_pack{};
WorkspaceData<CLTensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- bool is_prepared{ false };
- bool dynamic_weights{ false };
+ bool is_prepared{false};
+ bool dynamic_weights{false};
};
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
@@ -58,39 +60,45 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> mem
CLFullyConnectedLayer::~CLFullyConnectedLayer() = default;
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
}
-void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- fc_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info));
_impl->op = std::make_unique<opencl::ClFullyConnected>();
_impl->original_weights = weights;
_impl->is_prepared = fc_info.retain_internal_weights;
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
- if(_impl->weights_manager != nullptr)
+ if (_impl->weights_manager != nullptr)
{
_impl->weights_manager->manage(_impl->original_weights);
}
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->workspace =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
else
{
@@ -98,14 +106,14 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c
_impl->run_pack.add_tensor(ACL_DST, output);
}
- _impl->dynamic_weights =
- !weights->info()->are_values_constant() &&
- fc_info.transpose_weights &&
- !fc_info.are_weights_reshaped &&
- !fc_info.retain_internal_weights;
+ _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
}
-Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
FullyConnectedLayerInfo fc_info)
{
return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info);
@@ -113,7 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
void CLFullyConnectedLayer::run()
{
- if(!_impl->dynamic_weights)
+ if (!_impl->dynamic_weights)
{
prepare();
}
@@ -124,7 +132,7 @@ void CLFullyConnectedLayer::run()
void CLFullyConnectedLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
@@ -133,13 +141,13 @@ void CLFullyConnectedLayer::prepare()
_impl->is_prepared = true;
// Handle weights managed infrastructure
- if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
+ if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
{
// Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare
// This is for cases where multiple functions share the same b (weights)
// Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference
const ITensor *original_b = _impl->original_weights;
- if(!original_b->is_used())
+ if (!original_b->is_used())
{
_impl->weights_manager->pre_mark_as_unused(original_b);
}
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 7379e9d9fe..e4fbf78e13 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -28,9 +28,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
namespace arm_compute
{
@@ -41,29 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization()
CLFuseBatchNormalization::~CLFuseBatchNormalization() = default;
-void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
- _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
+ _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
+ bn_beta, bn_gamma, epsilon, fbn_type);
}
-Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void CLFuseBatchNormalization::run()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 427ea51ab9..871a1d6e27 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemm.h"
@@ -40,15 +41,15 @@ using OperatorType = opencl::ClGemm;
struct CLGEMM::Impl
{
- const ICLTensor *b{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *b{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -60,12 +61,25 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *
CLGEMM::~CLGEMM() = default;
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
}
-void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
@@ -73,25 +87,33 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor
_impl->op = std::make_unique<OperatorType>();
_impl->is_prepared = gemm_info.retain_internal_weights();
- _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info);
+ _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+ alpha, beta, gemm_info);
_impl->aux_mem_req = _impl->op->workspace();
// Manage/allocate auxilairy tensors
- if(_impl->is_prepared)
+ if (_impl->is_prepared)
{
_impl->run_pack.add_const_tensor(ACL_SRC_0, a);
_impl->run_pack.add_tensor(ACL_DST, output);
}
else
{
- _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, _impl->b } };
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, _impl->b}};
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info);
}
@@ -107,15 +129,15 @@ void CLGEMM::run()
void CLGEMM::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->b->mark_as_unused();
}
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c8c18f35db..aef7cddd7a 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,11 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "support/Cast.h"
@@ -47,18 +48,19 @@ using namespace arm_compute::experimental;
struct CLGEMMConvolutionLayer::Impl
{
- const ITensor *weights{ nullptr };
- std::unique_ptr<opencl::ClGemmConv2d> op{ nullptr };
+ const ITensor *weights{nullptr};
+ std::unique_ptr<opencl::ClGemmConv2d> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(memory_manager);
@@ -67,40 +69,60 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m
CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, num_groups);
}
-void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
_impl->weights = weights;
_impl->op = std::make_unique<opencl::ClGemmConv2d>();
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, input },
- { TensorType::ACL_SRC_1, weights },
- { TensorType::ACL_SRC_2, biases },
- { TensorType::ACL_DST, output }
- };
- _impl->prep_pack =
- {
- { TensorType::ACL_SRC_1, weights },
- { TensorType::ACL_SRC_2, biases },
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input},
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
};
- _impl->aux_mem_req = _impl->op->workspace();
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
@@ -115,14 +137,14 @@ void CLGEMMConvolutionLayer::run()
void CLGEMMConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->weights->mark_as_unused();
}
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 9fc81c11da..7d40cf1829 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include <tuple>
@@ -40,12 +40,13 @@ namespace arm_compute
{
namespace
{
-std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+std::pair<Coordinates, Coordinates>
+compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
{
Coordinates start;
Coordinates end;
- if(is_nchw)
+ if (is_nchw)
{
start.set(0, deconv_info.pad_left());
start.set(1, deconv_info.pad_top());
@@ -63,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT
end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
}
- return { start, end };
+ return {start, end};
}
-Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ GEMMLowpOutputStageInfo &output_stage_info)
{
const auto data_type = input->data_type();
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
@@ -78,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier(0);
int output_shift(0);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -122,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage
CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default;
-Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
DataLayout data_layout = input->data_layout();
- const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 ||
+ deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
const bool is_nchw = input->data_layout() == DataLayout::NCHW;
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
@@ -144,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
TensorShape nhwc_weights_shape = weights->tensor_shape();
TensorShape nhwc_input_shape = input->tensor_shape();
- if(is_nchw)
+ if (is_nchw)
{
permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
permute(nhwc_input_shape, PermutationVector(2, 0, 1));
- TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+ TensorInfo nhwc_input_info = input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(nhwc_input_shape)
+ .set_data_layout(DataLayout::NCHW);
- TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+ TensorInfo nhwc_weights_info = weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(nhwc_weights_shape)
+ .set_data_layout(DataLayout::NCHW);
CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
}
- const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
- const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+ const TensorShape reshaped_shape =
+ TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+ const TensorInfo reshaped_info =
+ weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]);
@@ -166,77 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
- input->dimension(idx_w),
- input->dimension(idx_h),
- input->dimension(idx_b));
+ input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b));
TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true);
GEMMLowpOutputStageInfo output_stage_info;
- if(is_quantized)
+ if (is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+ &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr,
+ &gemm_output_info.set_data_type(DataType::S32), gemm_info));
ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true),
+ &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
}
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
- const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
- TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+ weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+ const TensorShape deconv_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
- if(padded_input && is_quantized)
+ if (padded_input && is_quantized)
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(
+ &col2im_output_info, nullptr,
+ &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()),
+ output, start_end.first, start_end.second));
}
- else if(padded_input)
+ else if (padded_input)
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
}
- else if(is_quantized)
+ else if (is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
}
return Status{};
}
-void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
}
-void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
- weights->info(),
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(
+ input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info));
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info);
_original_weights = weights;
- _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 ||
+ deconv_info.pad_top() > 0;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
@@ -245,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
// do an outer product in NCHW and then an accumulation through a reduction. This would have two
// drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
// might be slower than GEMM.
- if(_is_nchw)
+ if (_is_nchw)
{
_memory_group.manage(&_permuted_input);
_permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
@@ -257,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
// Reshape the input weights. The weights will be reshaped only once during the call to prepare()
- _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
- weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
- 1,
- input->info()->data_type(), weights->info()->quantization_info()));
+ _reshaped_weights.allocator()->init(
+ TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) *
+ weights_to_use->info()->dimension(2) *
+ weights_to_use->info()->dimension(3)),
+ 1, input->info()->data_type(), weights->info()->quantization_info()));
_reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
_transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
@@ -269,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true);
// Configure output stage for asymmetric quantized types
- if(_is_quantized)
+ if (_is_quantized)
{
// gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original
// and restore them back to make it work properly.
QuantizationInfo iq_info = input->info()->quantization_info();
QuantizationInfo wq_info = weights->info()->quantization_info();
- input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
- _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
+ input_to_use->info()->set_quantization_info(
+ QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
+ _reshaped_weights_t.info()->set_quantization_info(
+ QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
_mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
@@ -286,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
else
{
- _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+ _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f,
+ gemm_info);
}
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_input.allocator()->allocate();
}
@@ -298,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
ICLTensor *slice_output = nullptr;
ICLTensor *output_stage_output = nullptr;
- if(_padded_input && _is_quantized)
+ if (_padded_input && _is_quantized)
{
_memory_group.manage(&_slice_gemm_input);
_memory_group.manage(&_gemmlowp_final);
@@ -306,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
output_stage_output = &_slice_gemm_input;
slice_output = output;
}
- else if(_padded_input)
+ else if (_padded_input)
{
_memory_group.manage(&_slice_gemm_input);
deconv_reshape_output = &_slice_gemm_input;
slice_output = output;
}
- else if(_is_quantized)
+ else if (_is_quantized)
{
_memory_group.manage(&_gemmlowp_final);
deconv_reshape_output = &_gemmlowp_final;
@@ -324,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
// Configure a Col2Im call to reshape the output of GEMM
- _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+ _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(),
+ weights->info(), deconv_info);
_gemm_output.allocator()->allocate();
- if(_is_quantized)
+ if (_is_quantized)
{
GEMMLowpOutputStageInfo output_stage_info;
construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
- _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+ _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output,
+ output_stage_info);
_gemmlowp_final.allocator()->allocate();
}
// If the input was padded, the output needs to be sliced.
- if(_padded_input)
+ if (_padded_input)
{
- const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+ const auto start_end =
+ compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
_slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
_slice_gemm_input.allocator()->allocate();
}
@@ -350,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input_to_nhwc.run();
}
- if(_is_quantized)
+ if (_is_quantized)
{
_mm_gemmlowp.run();
}
@@ -366,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run()
CLScheduler::get().enqueue(*_deconv_reshape, false);
- if(_is_quantized)
+ if (_is_quantized)
{
_gemmlowp_output_stage.run();
}
- if(_padded_input)
+ if (_padded_input)
{
_slice_gemm.run();
}
@@ -379,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run()
void CLGEMMDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_weights.allocator()->allocate();
_permute_weights_to_nhwc.run();
@@ -392,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare()
_reshaped_weights.allocator()->allocate();
_reshape_weights.run();
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_weights.allocator()->free();
}
@@ -401,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare()
_transpose_weights.run();
// Prepare gemm
- if(!_is_quantized)
+ if (!_is_quantized)
{
_mm_gemm.prepare();
}
@@ -411,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare()
}
// Free resources
- if(!_reshaped_weights_t.is_used())
+ if (!_reshaped_weights_t.is_used())
{
_reshaped_weights_t.allocator()->free();
}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index d9029478a1..8bad198658 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -31,12 +31,12 @@
#include "arm_compute/core/Log.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/IMemoryManager.h"
-#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
namespace arm_compute
@@ -46,13 +46,13 @@ using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore;
struct CLGEMMLowpMatrixMultiplyCore::Impl
{
- const ICLTensor *b{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *b{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
@@ -63,12 +63,18 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(
+ const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
}
-void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
@@ -76,23 +82,29 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
_impl->op = std::make_unique<OperatorType>();
_impl->is_prepared = gemm_info.retain_internal_weights();
- _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info);
+ _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+ gemm_info);
_impl->aux_mem_req = _impl->op->workspace();
// Manage/allocate auxilairy tensors
- if(_impl->is_prepared)
+ if (_impl->is_prepared)
{
_impl->run_pack.add_const_tensor(ACL_SRC_0, a);
_impl->run_pack.add_tensor(ACL_DST, output);
}
else
{
- _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } };
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}};
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
}
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
return OperatorType::validate(a, b, c, output, gemm_info);
}
@@ -108,7 +120,7 @@ void CLGEMMLowpMatrixMultiplyCore::run()
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 6feed0d713..3dd8c5f101 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -40,27 +40,33 @@ namespace arm_compute
{
struct CLGEMMLowpOutputStage::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *bias{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClGemmLowpOutputStage> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *bias{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr};
ITensorPack run_pack{};
};
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage()
- : _impl(std::make_unique<Impl>())
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
{
}
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default;
CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default;
-void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
}
-void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -69,11 +75,15 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>();
- _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info);
- _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } };
+ _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(),
+ info);
+ _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}};
}
-Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info)
{
return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info);
}
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index 033c117cec..2610cb1a3b 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLGather.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/kernels/CLGatherKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
namespace arm_compute
{
@@ -35,7 +35,11 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGather::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
auto k = std::make_unique<CLGatherKernel>();
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 9cb7d618cf..b2c1d2631e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -27,13 +27,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
#include "src/core/CL/kernels/CLPadLayerKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -71,48 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage
CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default;
-void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+void CLGenerateProposalsLayer::configure(const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+ configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out,
+ num_valid_proposals, info);
}
-void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
- ICLTensor *scores_out,
- ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
+void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+ proposals->info(), scores_out->info(),
+ num_valid_proposals->info(), info));
ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
_is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType scores_data_type = scores->info()->data_type();
_is_qasymm8 = scores_data_type == DataType::QASYMM8;
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ const int num_anchors = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
- const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+ const QuantizationInfo rois_qinfo =
+ (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
- _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors,
+ ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+ _deltas_flattened.allocator()->init(
+ TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
_memory_group.manage(&_deltas_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
_flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
@@ -126,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
// Permute and reshape scores
_memory_group.manage(&_scores_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1});
_flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
@@ -140,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
CLTensor *anchors_to_use = &_all_anchors;
CLTensor *deltas_to_use = &_deltas_flattened;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
_deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -163,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
anchors_to_use->allocator()->allocate();
_all_proposals_to_use = &_all_proposals;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
- _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _all_proposals_quantized.allocator()->init(
+ TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
_quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
@@ -183,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
// Note that NMS needs outputs preinitialized.
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+ rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
@@ -195,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
_num_valid_proposals = num_valid_proposals;
_memory_group.manage(&_proposals_4_roi_values);
- _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
- BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+ _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values,
+ &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+ BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+ true, min_size_scaled, info.im_width(), info.im_height()));
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
_all_proposals_to_use->allocator()->allocate();
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
_proposals_4_roi_values.allocator()->allocate();
}
-Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
- const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -216,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
- const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_anchors =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -227,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
}
- TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
-
- TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- if(scores->data_layout() == DataLayout::NHWC)
+ TensorInfo all_anchors_info(
+ anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(
+ anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info =
+ deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+ .set_is_resizable(true);
+ TensorInfo scores_permuted_info =
+ scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if (scores->data_layout() == DataLayout::NHWC)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
}
- TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo deltas_flattened_info(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
- TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(
+ scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
- TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
- if(is_qasymm8)
+ TensorInfo proposals_4_roi_values_quantized(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+ .set_quantization_info(QuantizationInfo(0.125f, 0));
+ if (is_qasymm8)
{
- TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ TensorInfo all_anchors_f32_info(anchors->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
- TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
-
- TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ TensorInfo deltas_flattened_f32_info(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(
+ &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
- if(num_valid_proposals->total_size() > 0)
+ if (num_valid_proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
}
- if(proposals->total_size() > 0)
+ if (proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
}
}
- if(scores_out->total_size() > 0)
+ if (scores_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -356,7 +412,7 @@ void CLGenerateProposalsLayer::run()
CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
// Transpose and reshape the inputs
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_permute_deltas.run();
_permute_scores.run();
@@ -364,7 +420,7 @@ void CLGenerateProposalsLayer::run()
_flatten_deltas.run();
_flatten_scores.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_dequantize_anchors->run();
_dequantize_deltas->run();
@@ -373,7 +429,7 @@ void CLGenerateProposalsLayer::run()
// Build the boxes
CLScheduler::get().enqueue(*_bounding_box_kernel, false);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_quantize_all_proposals->run();
}
diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
index 90af36aa77..1a2369c5c2 100644
--- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
@@ -26,36 +26,45 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/gpu/cl/operators/ClIndirectConv2d.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
namespace arm_compute
{
struct CLIndirectConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClIndirectConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr};
};
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer()
- : _impl(std::make_unique<Impl>())
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>())
{
}
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default;
CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default;
-void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
}
-void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -65,10 +74,15 @@ void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_conte
_impl->biases = biases;
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClIndirectConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
}
-Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -83,4 +97,4 @@ void CLIndirectConvolutionLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index 5feafe19db..0e994e1aee 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -27,50 +27,62 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT
- : _inst_norm_kernel(),
- _mean_var_kernel(),
- _mean_var_tensor(),
- _ctx(ctx)
+ : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx)
{
}
CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer()
{
}
-void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(
+ ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
}
-void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision)
{
ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision);
auto w = std::make_unique<CLComputeMeanVariance>();
w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision);
_mean_var_kernel = std::move(w);
auto k = std::make_unique<CLInstanceNormalizationLayerKernel>();
- k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ k->configure(compile_context, input, &_mean_var_tensor, output,
+ InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
_inst_norm_kernel = std::move(k);
_mean_var_tensor.allocator()->allocate();
}
-Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision)
{
- return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ return CLInstanceNormalizationLayerKernel::validate(
+ input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
}
void CLInstanceNormalizationLayer::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured");
+ ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel,
+ "The child class didn't set the CL kernel or function isn't configured");
schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get());
schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get());
}
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 1278385f53..4fe1d9b20b 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -29,12 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace
@@ -57,7 +57,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
}
-void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayer::configure(
+ const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
{
ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
@@ -86,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
sum_sq.set_tensor_shape(shape);
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
shape.set(actual_axis, 1);
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index ea08beca75..3b50234c77 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/gpu/cl/kernels/ClTransposeKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
namespace arm_compute
{
@@ -40,54 +40,155 @@ using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::utils::info_helpers;
CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
- _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(),
- _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(),
- _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(),
- _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(),
- _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(),
- _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(),
- _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(),
- _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(),
- _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(),
- _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false),
- _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
+ : _memory_group(std::move(memory_manager)),
+ _fully_connected_input_gate(),
+ _accum_input_gate1(),
+ _subtract_input_gate(),
+ _pixelwise_mul_input_gate(),
+ _activation_input_gate(),
+ _fully_connected_forget_gate(),
+ _accum_forget_gate1(),
+ _pixelwise_mul_forget_gate(),
+ _activation_forget_gate(),
+ _fully_connected_cell_state(),
+ _gemm_cell_state1(),
+ _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()),
+ _accum_cell_state1(),
+ _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(),
+ _activation_cell_state(),
+ _cell_clip(),
+ _pixelwise_mul_cell_state2(),
+ _fully_connected_output(),
+ _pixelwise_mul_output_state1(),
+ _accum_output1(),
+ _activation_output(),
+ _activation_output_state(),
+ _pixelwise_mul_output_state2(),
+ _fully_connected_output_state(),
+ _projection_clip(),
+ _copy_cell_state(),
+ _copy_output(),
+ _concat_scratch_buffer(),
+ _concat_inputs_forget_gate(),
+ _concat_weights_forget_gate(),
+ _concat_weights_input_gate(),
+ _concat_weights_output(),
+ _ones_fill(),
+ _mean_std_norm_input_gate(),
+ _pixelwise_mul_input_gate_coeff(),
+ _accum_input_gate_bias(),
+ _mean_std_norm_forget_gate(),
+ _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(),
+ _mean_std_norm_cell_gate(),
+ _pixelwise_mul_cell_gate_coeff(),
+ _accum_cell_gate_bias(),
+ _mean_std_norm_output_gate(),
+ _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(),
+ _input_gate_out1(),
+ _input_gate_out2(),
+ _input_gate_out3(),
+ _input_gate_out4(),
+ _forget_gate_out1(),
+ _forget_gate_out2(),
+ _forget_gate_out3(),
+ _forget_gate_out4(),
+ _forget_gate_out5(),
+ _forget_gate_out6(),
+ _cell_state_out1(),
+ _cell_state_out2(),
+ _cell_state_out3(),
+ _cell_state_out4(),
+ _cell_state_out5(),
+ _output1(),
+ _output2(),
+ _output3(),
+ _output4(),
+ _cell_state_activation(),
+ _output_state1(),
+ _ones(),
+ _input_layer_norm_out1(),
+ _input_layer_norm_out2(),
+ _forget_layer_norm_out1(),
+ _forget_layer_norm_out2(),
+ _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(),
+ _output_layer_norm_out1(),
+ _output_layer_norm_out2(),
+ _run_peephole_opt(false),
+ _run_cifg_opt(false),
+ _perform_cell_clipping(false),
+ _has_projection_weights(false),
+ _perform_projection_clipping(false),
+ _is_prepared(false),
+ _is_layer_norm_lstm(false)
{
}
CLLSTMLayer::~CLLSTMLayer() = default;
-void CLLSTMLayer::configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in,
+ cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
cell_threshold, projection_threshold);
}
-void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
- ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out,
- output, lstm_params, activation_info, cell_threshold, projection_threshold);
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
_is_layer_norm_lstm = lstm_params.use_layer_norm();
@@ -96,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
- input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- output_state_in->info(), cell_state_in->info(),
- scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
- lstm_params_info, activation_info, cell_threshold, projection_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+ cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
// Configure block that calculates the forget gate
@@ -126,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
weights_vector.emplace_back(input_to_forget_weights);
weights_vector.emplace_back(recurrent_to_forget_weights);
- const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+ const TensorShape weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
_forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6,
+ (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
CLTensor *forget_gate_out = &_forget_gate_out5;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_forget_gate_out4, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+ ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -154,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
{
_forget_gate_out3.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
_mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out,
+ lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias,
+ &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -178,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
// input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
CLTensor *input_gate_out = &_input_gate_out1;
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
- _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -195,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
std::vector<const ICLTensor *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
_input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX);
@@ -203,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out3);
- _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2,
+ (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+ &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+ &_input_gate_out4, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -221,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_input_gate_out1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
_mean_std_norm_input_gate.configure(compile_context, input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out,
+ lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(),
+ &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(compile_context, input_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -249,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights,
+ (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info());
_recurrent_to_cell_weights = recurrent_to_cell_weights;
_memory_group.manage(&_cell_state_out3);
- _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f,
+ 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4,
+ ConvertPolicy::SATURATE);
CLTensor *cell_state_out_ptr = &_cell_state_out4;
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
_mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr,
+ lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
_activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1,
+ ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
- if(cell_threshold != 0.f)
+ if (cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
+ _cell_clip.configure(compile_context, &_cell_state_out1, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold));
}
// Configure block that calculates the output
@@ -298,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
std::vector<const ICLTensor *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
_output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX);
@@ -306,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2,
+ (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
CLTensor *output_gate_out = &_output4;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(),
+ &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -329,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
{
_output1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
_mean_std_norm_output_gate.configure(compile_context, output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out,
+ lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias,
+ &_output_layer_norm_out2, ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(compile_context, output_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -361,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_cell_state_activation);
_activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out,
+ output_state_out_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_activation.allocator()->allocate();
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
- if(projection_threshold != 0.f)
+ if (projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(compile_context, output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -projection_threshold, projection_threshold));
}
}
@@ -383,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
// Vector for holding the tensors to store in scratch buffer
std::vector<const ICLTensor *> scratch_inputs;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(input_gate_out);
}
@@ -397,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
output_gate_out->allocator()->allocate();
}
-Status CLLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check dimensions
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -438,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
- && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+ cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
// If CIFG is used, input layer normalization weights tensor is omitted
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
}
@@ -459,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -470,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
}
// Check peephole optimization
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -488,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
- lstm_params.recurrent_to_input_weights(),
- lstm_params.input_gate_bias());
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -525,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
- TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, lstm_params.input_to_input_weights(),
+ (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+ &input_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(lstm_params.use_layer_norm())
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(cell_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (cell_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
- -cell_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&cell_state_tmp, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold)));
}
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
- TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
// Validate output gate tmp
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- if(lstm_params.has_projection())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
- if(projection_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ if (lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out));
+ if (projection_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out,
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
}
}
@@ -616,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
// Validate scratch concatenation
std::vector<const ITensorInfo *> inputs_vector_info_raw;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
@@ -638,12 +829,12 @@ void CLLSTMLayer::run()
_fully_connected_forget_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
_pixelwise_mul_forget_gate_coeff.run();
@@ -651,7 +842,7 @@ void CLLSTMLayer::run()
}
_activation_forget_gate.run();
- if(_run_cifg_opt)
+ if (_run_cifg_opt)
{
_ones_fill.run();
_subtract_input_gate.run();
@@ -660,13 +851,13 @@ void CLLSTMLayer::run()
{
_fully_connected_input_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
_pixelwise_mul_input_gate_coeff.run();
@@ -679,12 +870,10 @@ void CLLSTMLayer::run()
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights);
pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2);
- CLScheduler::get().enqueue_op(*_transpose_cell_state,
- pack,
- false);
+ CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false);
_gemm_cell_state1.run();
_accum_cell_state1.run();
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
_pixelwise_mul_cell_gate_coeff.run();
@@ -695,19 +884,19 @@ void CLLSTMLayer::run()
_pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
- if(_perform_cell_clipping)
+ if (_perform_cell_clipping)
{
_cell_clip.run();
}
_fully_connected_output.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_output_state1.run();
_accum_output1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
_pixelwise_mul_output_gate_coeff.run();
@@ -718,10 +907,10 @@ void CLLSTMLayer::run()
_activation_output_state.run();
_pixelwise_mul_output_state2.run();
- if(_has_projection_weights)
+ if (_has_projection_weights)
{
_fully_connected_output_state.run();
- if(_perform_projection_clipping)
+ if (_perform_projection_clipping)
{
_projection_clip.run();
}
@@ -735,10 +924,10 @@ void CLLSTMLayer::run()
void CLLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_concat_weights_forget_gate.run();
- if(!_run_cifg_opt)
+ if (!_run_cifg_opt)
{
_concat_weights_input_gate.run();
}
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index d14c6102d5..ea64eda023 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -25,12 +25,12 @@
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "arm_compute/core/Validate.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include <memory>
@@ -46,48 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit
} // namespace
CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
- _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
- _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
- _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
- _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
- _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
- _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
- _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+ : _memory_group(std::move(memory_manager)),
+ _gemmlowp(),
+ _output_stage(),
+ _transpose_weights(),
+ _concat_input_weights(),
+ _concat_recurrent_weights(),
+ _concat_weights(),
+ _concat_inputs(),
+ _concat_bias(),
+ _sigmoid_forget_gate(),
+ _sigmoid_input_gate(),
+ _sigmoid_output_gate(),
+ _tanh_modulation_gate(),
+ _tanh_output_state(),
+ _add_cell_state_tmps(),
+ _add2(),
+ _mul_forget_gate_cell_state(),
+ _mul_input_gate_input_mod_gate(),
+ _mul_output_state_tmp_output_gate(),
+ _slice_input_tensor(),
+ _slice_forget_tensor(),
+ _slice_cell_tensor(),
+ _slice_output_tensor(),
+ _dequantize(),
+ _quantize(),
+ _input_to_input_weights(nullptr),
+ _input_to_forget_weights(nullptr),
+ _input_to_cell_weights(nullptr),
+ _input_to_output_weights(nullptr),
+ _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr),
+ _recurrent_to_cell_weights(nullptr),
+ _recurrent_to_output_weights(nullptr),
+ _input_gate_bias(nullptr),
+ _forget_gate_bias(nullptr),
+ _cell_bias(nullptr),
+ _output_gate_bias(nullptr),
+ _recurrent_weights(),
+ _input_weights(),
+ _weights(),
+ _input(),
+ _weights_transposed(),
+ _output_highp(),
+ _output_lowp(),
+ _bias(),
+ _forget_gate_input(),
+ _input_gate_input(),
+ _output_gate_input(),
+ _input_modulation_gate_input(),
+ _forget_gate_output(),
+ _input_gate_output(),
+ _output_gate_output(),
+ _input_modulation_gate_output(),
+ _cell_state_tmp1(),
+ _cell_state_tmp2(),
+ _output_state_tmp(),
+ _output_state_out_symm(),
+ _output_state_out_f32(),
+ _is_prepared(false)
{
}
void CLLSTMLayerQuantized::configure(const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out)
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
- output_state_out);
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
}
-void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out)
+void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
-
- ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+ cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
output_state_out);
- ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
- input_to_output_weights->info(),
- recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(
+ input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+ recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info()));
const int input_size = input->info()->dimension(0);
const int batch_size = input->info()->dimension(1);
@@ -95,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
- auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
- auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+ auto_init_if_empty(*cell_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
_input_to_input_weights = input_to_input_weights;
_input_to_forget_weights = input_to_forget_weights;
@@ -124,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
- _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _input_weights.allocator()->init(
+ TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY);
- _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _recurrent_weights.allocator()->init(
+ TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY);
std::vector<const ICLTensor *> weights_vector;
weights_vector.emplace_back(&_recurrent_weights);
weights_vector.emplace_back(&_input_weights);
- _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _weights.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX);
_transpose_weights.configure(compile_context, &_weights, &_weights_transposed);
@@ -144,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
input_vector.emplace_back(output_state_in);
_memory_group.manage(&_input);
- _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _input.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
_concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX);
// Bias concatenation
@@ -159,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
// Run gemmlowp
_memory_group.manage(&_output_highp);
@@ -169,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
// Set the offset back
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
// multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
_output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
@@ -191,85 +280,111 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
_bias.allocator()->allocate();
// Get the gate tensors
- if(batch_size > 1)
+ if (batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0},
+ {output_size, batch_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0},
+ {2 * output_size, batch_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input,
+ {2 * output_size, 0}, {3 * output_size, batch_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0},
+ {4 * output_size, batch_size});
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size},
+ {2 * output_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+ {3 * output_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size},
+ {4 * output_size});
_output_lowp.allocator()->allocate();
}
// Forget gate
_memory_group.manage(&_forget_gate_output);
- _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_output.allocator()->init(
+ TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
- _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_output.allocator()->init(
+ TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
// Input modulation gate equation
_memory_group.manage(&_input_modulation_gate_output);
- _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_output.allocator()->init(
+ TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
- _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_output.allocator()->init(
+ TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state_tmp1);
- _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_tmp1.allocator()->init(
+ TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state_tmp2);
- _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_tmp2.allocator()->init(
+ TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output,
+ &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
- _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+ _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out,
+ ConvertPolicy::SATURATE);
_cell_state_tmp1.allocator()->allocate();
_cell_state_tmp2.allocator()->allocate();
// Short term memory
_memory_group.manage(&_output_state_tmp);
- _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _output_state_tmp.allocator()->init(
+ TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
- _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_state_out_symm.allocator()->init(
+ TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output,
+ &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
- _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _output_state_out_f32.allocator()->init(
+ TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
_dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
@@ -278,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
}
Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
- output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
const int input_size = input->dimension(0);
@@ -299,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
- TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
- TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
- TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+ TensorInfo input_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(input_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(output_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(
+ input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QASYMM8)
+ .set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QSYMM16)
+ .set_quantization_info(qsymm_4));
// Shape checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
// Data type checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
// Quantization checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
@@ -343,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
- ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
// _concat_weights
std::vector<const ITensorInfo *> weights_vector;
@@ -353,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
// _transpose_weights
const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
- TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
// _concat_inputs
@@ -379,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
// _gemmlowp
const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
// Set the offset back
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -390,7 +542,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// _output_stage
GEMMLowpOutputStageInfo info{};
@@ -405,68 +558,91 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
TensorInfo input_modulation_gate_input;
TensorInfo output_gate_input;
- if(batch_size > 1)
+ if (batch_size > 1)
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
}
else
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
}
// _sigmoid_forget_gate
const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _sigmoid_input_gate
const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _tanh_modulation_gate
- const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+ qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _sigmoid_output_gate
const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&output_gate_input, &output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _mul_forget_gate_cell_state
const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
// _mul_input_gate_input_mod_gate
const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+ &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _add_cell_state_tmps
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
// _tanh_modulation_gate
const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, &output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _mul_output_state_tmp_output_gate
const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+ &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _dequantize
const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -475,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
// _quantize
ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
- if(cell_state_out->total_size() != 0)
+ if (cell_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
}
- if(output_state_out->total_size() != 0)
+ if (output_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -541,7 +717,7 @@ void CLLSTMLayerQuantized::run()
void CLLSTMLayerQuantized::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_input_weights.allocator()->allocate();
_concat_input_weights.run();
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index 696191c485..ea21c54bc3 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
@@ -33,7 +34,10 @@ namespace arm_compute
{
namespace experimental
{
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors)
struct CLLogicalAnd::Impl
{
- const ICLTensor *src0{ nullptr };
- const ICLTensor *src1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr };
+ const ICLTensor *src0{nullptr};
+ const ICLTensor *src1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLLogicalAnd> op{nullptr};
};
-CLLogicalAnd::CLLogicalAnd()
- : _impl(std::make_unique<Impl>())
+CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
+CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default;
CLLogicalAnd::~CLLogicalAnd() = default;
@@ -73,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
_impl->src0 = input1;
_impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
index a0504d7852..71f9cce54f 100644
--- a/src/runtime/CL/functions/CLLogicalNot.cpp
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClLogicalNot.h"
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct CLLogicalNot::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClLogicalNot> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClLogicalNot> op{nullptr};
};
-CLLogicalNot::CLLogicalNot()
- : _impl(std::make_unique<Impl>())
+CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
+CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default;
CLLogicalNot::~CLLogicalNot() = default;
@@ -72,4 +72,4 @@ void CLLogicalNot::run()
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index f9a606e8a5..3db4fdae84 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
@@ -33,7 +34,10 @@ namespace arm_compute
{
namespace experimental
{
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors)
struct CLLogicalOr::Impl
{
- const ICLTensor *src0{ nullptr };
- const ICLTensor *src1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLLogicalOr> op{ nullptr };
+ const ICLTensor *src0{nullptr};
+ const ICLTensor *src1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLLogicalOr> op{nullptr};
};
-CLLogicalOr::CLLogicalOr()
- : _impl(std::make_unique<Impl>())
+CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
+CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default;
CLLogicalOr::~CLLogicalOr() = default;
@@ -73,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
_impl->src0 = input1;
_impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index bef422fca1..e8bdad706b 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTypes.h"
+
#include "src/gpu/cl/operators/ClMatMul.h"
namespace arm_compute
@@ -32,23 +34,32 @@ using OperatorType = opencl::ClMatMul;
struct CLMatMul::Impl
{
- std::unique_ptr<OperatorType> op{ nullptr };
+ std::unique_ptr<OperatorType> op{nullptr};
ITensorPack run_pack{};
};
-CLMatMul::CLMatMul()
- : _impl(std::make_unique<Impl>())
+CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>())
{
}
CLMatMul::~CLMatMul() = default;
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CLMatMul::configure(ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *output,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(settings);
configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
}
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+void CLMatMul::configure(const CLCompileContext &compile_context,
+ ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *output,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
@@ -56,10 +67,14 @@ void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs
_impl->op = std::make_unique<OperatorType>();
_impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
- _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
+ _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
}
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+Status CLMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
{
return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
}
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
index 2786d32d33..7494f379b9 100644
--- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -27,26 +27,32 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
namespace arm_compute
{
CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
- : _fill(),
- _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
+ : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
{
}
CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default;
-void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
}
-void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
const PixelValue zero_value(0.f);
@@ -55,7 +61,10 @@ void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICL
_unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info);
}
-Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index a81cbca1b0..5892c0e840 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
namespace arm_compute
{
@@ -35,7 +35,10 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
auto k = std::make_unique<CLMeanStdDevNormalizationKernel>();
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index c0cc5184e6..f93f82f1a2 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
namespace arm_compute
{
@@ -50,7 +50,10 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+void CLNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
@@ -58,21 +61,24 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC
// Configure normalization kernel
_norm_kernel->configure(compile_context, input, output, norm_info);
- if(!_norm_kernel->border_size().empty())
+ if (!_norm_kernel->border_size().empty())
{
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+ _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue());
}
}
-Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status CLNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
return CLNormalizationLayerKernel::validate(input, output, norm_info);
}
void CLNormalizationLayer::run()
{
- if(!_norm_kernel->border_size().empty())
+ if (!_norm_kernel->border_size().empty())
{
// Run border handler
CLScheduler::get().enqueue(*_border_handler, false);
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index 63c9164a94..939c95bd45 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -24,20 +24,26 @@
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
-#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
#include <utility>
namespace arm_compute
{
-void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
ARM_COMPUTE_LOG_PARAMS(input, output, mean, std);
auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>();
@@ -45,8 +51,10 @@ void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_contex
_kernel = std::move(k);
}
-Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
}
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index 186e7b4ba2..ce6d285ebe 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/gpu/cl/IClKernel.h"
#include "src/gpu/cl/operators/ClPRelu.h"
@@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu;
struct CLPReluLayer::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
};
-CLPReluLayer::CLPReluLayer()
- : _impl(std::make_unique<Impl>())
+CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>())
{
}
-CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
CLPReluLayer::~CLPReluLayer() = default;
@@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp
configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
}
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *alpha,
+ ICLTensor *output)
{
_impl->src_0 = input;
_impl->src_1 = alpha;
_impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info()));
+ _impl->op->configure(compile_context, input->info(), alpha->info(),
+ (output == nullptr ? input->info() : output->info()));
}
Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 0ed8f03d64..e788ded512 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -22,37 +22,38 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
-#include "src/core/CL/kernels/CLPadLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
namespace arm_compute
{
-CLPadLayer::CLPadLayer()
- : _pad_kernel(std::make_unique<CLPadLayerKernel>()),
- _copy(),
- _perform_pad(false)
+CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false)
{
}
CLPadLayer::~CLPadLayer() = default;
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(
+ ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
- _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
- {
- return info.first > 0 || info.second > 0;
- });
+ _perform_pad =
+ std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
- if(_perform_pad)
+ if (_perform_pad)
{
_pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
}
@@ -62,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i
_copy.configure(compile_context, input, output);
}
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
- bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
- {
- return info.first > 0 || info.second > 0;
- });
+ bool perform_pad =
+ std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
- if(perform_pad)
+ if (perform_pad)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
}
@@ -81,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
}
void CLPadLayer::run()
{
- if(_perform_pad)
+ if (_perform_pad)
{
CLScheduler::get().enqueue(*_pad_kernel);
}
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index a56afff7df..7f97eed98a 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -27,22 +27,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClPermute.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
namespace arm_compute
{
struct CLPermute::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClPermute> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClPermute> op{nullptr};
};
-CLPermute::CLPermute()
- : _impl(std::make_unique<Impl>())
+CLPermute::CLPermute() : _impl(std::make_unique<Impl>())
{
}
@@ -53,7 +52,10 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu
configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
}
-void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+void CLPermute::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PermutationVector &perm)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, perm);
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 9d91e58367..6aa9d9cbb3 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClMul.h"
@@ -34,38 +35,55 @@ namespace arm_compute
{
struct CLPixelWiseMultiplication::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClMul> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClMul> op{nullptr};
};
-CLPixelWiseMultiplication::CLPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
-CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default;
CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default;
-void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy,
+ rounding_policy, act_info);
}
-void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClMul>();
- _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+ _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy,
+ rounding_policy, act_info);
}
-Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
@@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run()
struct CLComplexPixelWiseMultiplication::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClComplexMul> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClComplexMul> op{nullptr};
};
-CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default;
-
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLComplexPixelWiseMultiplication &
+CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default;
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClComplexMul::validate(input1, input2, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp
index 11ae1d0fe6..ce1092a7cc 100644
--- a/src/runtime/CL/functions/CLPooling3dLayer.cpp
+++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClPool3d.h"
@@ -32,14 +33,13 @@ namespace arm_compute
{
struct CLPooling3dLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- ICLTensor *indices{ nullptr };
- std::unique_ptr<opencl::ClPool3d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ ICLTensor *indices{nullptr};
+ std::unique_ptr<opencl::ClPool3d> op{nullptr};
};
-CLPooling3dLayer::CLPooling3dLayer()
- : _impl(std::make_unique<Impl>())
+CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>())
{
}
CLPooling3dLayer::~CLPooling3dLayer() = default;
@@ -49,7 +49,10 @@ void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, cons
configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info);
}
-void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info)
+void CLPooling3dLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Pooling3dLayerInfo &pool_info)
{
_impl->src = input;
_impl->dst = output;
@@ -58,7 +61,8 @@ void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const
_impl->op->configure(compile_context, input->info(), output->info(), pool_info);
}
-Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+Status
+CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
{
return opencl::ClPool3d::validate(input, output, pool_info);
}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 0ebce318fa..65e53b9be3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClPool2d.h"
@@ -32,34 +33,44 @@ namespace arm_compute
{
struct CLPoolingLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- ICLTensor *indices{ nullptr };
- std::unique_ptr<opencl::ClPool2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ ICLTensor *indices{nullptr};
+ std::unique_ptr<opencl::ClPool2d> op{nullptr};
};
-CLPoolingLayer::CLPoolingLayer()
- : _impl(std::make_unique<Impl>())
+CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>())
{
}
CLPoolingLayer::~CLPoolingLayer() = default;
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
}
-void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices)
{
_impl->src = input;
_impl->dst = output;
_impl->indices = indices;
_impl->op = std::make_unique<opencl::ClPool2d>();
- _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
+ _impl->op->configure(compile_context, input->info(), output->info(), pool_info,
+ (indices) ? indices->info() : nullptr);
}
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CLPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
return opencl::ClPool2d::validate(input, output, pool_info, indices);
}
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 019f0a7e61..cfd0ec4fbf 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -29,31 +29,40 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
using namespace arm_compute;
-CLPriorBoxLayer::CLPriorBoxLayer()
- : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
{
}
-void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
}
-void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
- _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
- _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
- if(!info.max_sizes().empty())
+ _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.min_sizes().size() * sizeof(float));
+ _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.aspect_ratios().size() * sizeof(float));
+ if (!info.max_sizes().empty())
{
- _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+ _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.max_sizes().size() * sizeof(float));
}
auto k = std::make_unique<CLPriorBoxLayerKernel>();
@@ -61,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I
_kernel = std::move(k);
}
-Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayer::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
}
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 7fbb866fa9..12f6f89290 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -26,29 +26,36 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
using namespace arm_compute::utils::info_helpers;
using namespace arm_compute::opencl::kernels;
namespace
{
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
- float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensorInfo *mm_input,
+ const ITensorInfo *mm_weights,
+ const ITensorInfo *bias,
+ float gemmlowp_scale,
+ const TensorInfo *mm_res_info,
+ const TensorInfo *outstage_tensor_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
return Status{};
}
} // namespace
@@ -78,14 +85,12 @@ void CLQLSTMLayer::TensorCopyKernel::run()
_src->map(q, true);
_dst->map(q, true);
- Iterator input_iter{ _src, _window };
- Iterator output_iter{ _dst, _window };
+ Iterator input_iter{_src, _window};
+ Iterator output_iter{_dst, _window};
- execute_window_loop(_window, [&](const Coordinates &)
- {
- memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
- },
- input_iter, output_iter);
+ execute_window_loop(
+ _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+ output_iter);
_src->unmap(q);
_dst->unmap(q);
@@ -104,7 +109,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
_layer_norms(),
_copy_output()
{
- for(auto &norm : _layer_norms)
+ for (auto &norm : _layer_norms)
{
norm = std::make_unique<CLQLSTMLayerNormalizationKernel>();
}
@@ -129,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf
{
// Output quantization scale will be different, but ignored here
// since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
+ const TensorInfo out{in};
return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}
-void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
- CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context,
+ CLGEMMLowpMatrixMultiplyCore &mm,
+ CLGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input,
+ const ICLTensor *mm_weights,
+ const ICLTensor *bias,
+ CLTensor *mm_res,
+ CLTensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info)
{
_memory_group.manage(mm_res);
_memory_group.manage(outstage_res);
@@ -151,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML
mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
// Configure output stage
- quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
mm_res->allocator()->allocate();
}
-void CLQLSTMLayer::configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
- cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params);
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+ output_state_in, cell_state_out, output_state_out, output, lstm_params);
}
-void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
@@ -191,11 +222,11 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info));
const int batch_size = input->info()->dimension(1);
const int num_units = input_to_output_weights->info()->dimension(1);
@@ -216,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
// Layer normalization
_has_layer_norm = lstm_params.use_layer_norm();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -238,53 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
_has_cell_clipping = quantized_cell_clip > 0;
// Precompute effective bias for optimizing the matmul computations.
- if(!_has_cifg)
+ if (!_has_cifg)
{
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
- _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true));
+ _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(),
+ _input_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(
+ compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
- _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true));
- _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true));
- if(_has_projection)
+ _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(),
+ _input_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(
+ compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(
+ compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(),
+ _input_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(
+ compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if (_has_projection)
{
- _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
- if(_projection_bias != nullptr)
+ _projection_reduction->configure(
+ compile_context, _projection_weights->info(), _projection_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ if (_projection_bias != nullptr)
{
- _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+ _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias,
+ &_projection_eff_bias, ConvertPolicy::SATURATE);
}
}
// Pre-transpose weights to be used in GEMM.
- _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
- _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
- _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
- _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
- _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
- _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
- if(!_has_cifg)
+ _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights,
+ &_input_to_forget_weights_transposed);
+ _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights,
+ &_input_to_cell_weights_transposed);
+ _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights,
+ &_input_to_output_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights,
+ &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights,
+ &_recurrent_to_cell_weights_transposed);
+ _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights,
+ &_recurrent_to_output_weights_transposed);
+ if (!_has_cifg)
{
- _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
- _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(),
+ &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(),
+ &_recurrent_to_input_weights_transposed);
}
- if(_has_projection)
+ if (_has_projection)
{
_transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
}
@@ -297,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
// Forget gate.
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
- const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
- input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
- &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+ &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+ &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
&_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
mm_out_info, forget_gate_outstage_info);
- _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
ConvertPolicy::SATURATE);
_input_to_forget_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_forget_res);
- _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_forget_outstage_res);
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ const float cell_to_forget_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr,
+ &_cell_to_forget_outstage_res, gemmlowp_info);
_mul_cell_to_forget_res.allocator()->allocate();
- _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res,
+ &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
ConvertPolicy::SATURATE);
_cell_to_forget_outstage_res.allocator()->allocate();
}
CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);
_recurrent_to_forget_outstage_res.allocator()->allocate();
@@ -345,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_forget_gate);
_forget_gate.allocator()->init(forget_gate_info);
- _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
forget_activation_input->allocator()->allocate();
// Modulation gate.
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
- input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
- &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
- mm_out_info, cell_outstage_info);
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
- &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
- mm_out_info, cell_outstage_info);
-
- _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input,
+ &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res,
+ &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info);
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+ &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
+
+ _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res,
+ &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
ConvertPolicy::SATURATE);
_input_to_cell_outstage_res.allocator()->allocate();
CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
_recurrent_to_cell_outstage_res.allocator()->allocate();
@@ -378,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_cell_gate);
_cell_gate.allocator()->init(cell_gate_info);
- _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
cell_activation_input->allocator()->allocate();
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_input_gate.allocator()->init(input_gate_info);
_memory_group.manage(&_input_gate);
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.allocator()->init(*_forget_gate.info());
_input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -393,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
}
else
{
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
- input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
- &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
- mm_out_info, input_outstage_info);
-
- const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+ &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+ &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale =
+ _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
mm_out_info, input_outstage_info);
- _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
- ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_input_to_input_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
- _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _mul_cell_to_input_res.allocator()->init(
+ TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_input_res);
- _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+ &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_input_outstage_res);
- _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr,
+ &_cell_to_input_outstage_res, gemmlowp_info);
_mul_cell_to_input_res.allocator()->allocate();
- _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_cell_to_input_outstage_res.allocator()->allocate();
}
CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
_recurrent_to_input_outstage_res.allocator()->allocate();
input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
}
- _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
input_activation_input->allocator()->allocate();
}
// Cell.
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
- _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
- const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(mul_input_cell_scale, 0));
_memory_group.manage(&_mul_input_cell_res);
_mul_input_cell_res.allocator()->init(mul_input_cell_info);
- _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_gate.allocator()->allocate();
- _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+ _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out,
+ ConvertPolicy::SATURATE);
_mul_input_cell_res.allocator()->allocate();
_forget_gate.allocator()->allocate();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
- _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ _cell_clip.configure(compile_context, cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip));
}
// Output gate.
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
- input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
- &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
- mm_out_info, output_outstage_info);
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+ &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+ &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
&_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
mm_out_info, output_outstage_info);
- _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+ &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
ConvertPolicy::SATURATE);
_input_to_output_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
- _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
- const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(),
+ &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_output_outstage_res);
- _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr,
+ &_cell_to_output_outstage_res, gemmlowp_info);
_mul_cell_to_output_res.allocator()->allocate();
- _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+ &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
ConvertPolicy::SATURATE);
_cell_to_output_outstage_res.allocator()->allocate();
}
CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
_recurrent_to_output_outstage_res.allocator()->allocate();
@@ -503,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_output_gate);
_output_gate.allocator()->init(output_gate_info);
- _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
output_activation_input->allocator()->allocate();
// Hidden.
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
- _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_output_gate.allocator()->allocate();
_input_gate.allocator()->allocate();
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = output_state_in->info()->data_type();
@@ -525,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_memory_group.manage(&_hidden_gate);
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->init(*output_state_out->info());
_hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -536,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_hidden_mul_res.allocator()->allocate();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
const TensorInfo projection_outstage_info(*output_state_out->info());
- const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
- gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
- gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
- gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
- TensorInfo projection_mm_out_info{ mm_out_info };
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
- hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
- &_mm_projection_res, &_projection_outstage_res, projection_scale,
- projection_mm_out_info, projection_outstage_info);
+ configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+ &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+ &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
ICLTensor *accumulate_destination = output_state_out;
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->allocate();
_projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -565,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
accumulate_destination = &_projection_accumulate_res;
}
- _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination,
+ accumulate_destination, ConvertPolicy::SATURATE);
_projection_outstage_res.allocator()->allocate();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
_projection_accumulate_res.allocator()->allocate();
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
- quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ quantized_projection_clip =
+ utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip));
+ _projection_clip.configure(compile_context, output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip));
_has_projection_clipping = true;
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
_hidden_gate.allocator()->allocate();
@@ -600,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_copy_output.configure(compile_context, output_state_out, output);
}
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
- cell_state_out, output_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -622,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+ input_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -647,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
// Check whether peephole weights are all there or none
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
}
}
@@ -674,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
@@ -682,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Precompute effective bias for optimizing the matmul computations.
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.input_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_forget_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_cell_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_output_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
- if(lstm_params.projection_bias() != nullptr)
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.projection_weights(), &projection_eff_bias_info,
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+ if (lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
- &projection_eff_bias_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+ &projection_eff_bias_info, ConvertPolicy::SATURATE));
}
}
- const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
- const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+ input_to_forget_weights->data_type(),
+ input_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
// Validate weights transpose
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -717,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
}
GEMMLowpOutputStageInfo gemmlowp_info;
@@ -738,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Forget gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
- const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
- const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_forget_scale, &mm_out_info, &forget_outstage_info));
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+ &forget_outstage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
const ITensorInfo *b_info = forget_gate_bias;
@@ -770,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Modulation gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
- const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
- if(has_layer_norm)
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+ &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+ &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
const ITensorInfo *b_info = cell_bias;
@@ -791,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+ "Input gate bias must not be present when CIFG is used");
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+ &forget_gate_info, ConvertPolicy::SATURATE));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+ lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
- const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
- if(lstm_params.has_peephole_opt())
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale =
+ lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+ &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
const ITensorInfo *b_info = lstm_params.input_gate_bias();
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_outstage_info, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
- if(quantized_cell_clip > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if (quantized_cell_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
- quantized_cell_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip)));
}
// Output gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+ &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+ DataType::QSYMM16);
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
const ITensorInfo *b_info = output_gate_bias;
@@ -877,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = hidden_out_info.data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
const bool projection_tensor_copy_required = num_units != output_size;
// Projection.
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+ lstm_params.projection_weights());
ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
- const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
const TensorInfo projection_outstage_info(*output_state_out);
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
- TensorInfo projection_mm_out_info{ mm_out_info };
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+ &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
&projection_outstage_info));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+ ConvertPolicy::SATURATE));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip)));
}
}
else
{
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
}
}
- if(cell_state_out->total_size() > 0)
+ if (cell_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
}
- if(output_state_out->total_size() > 0)
+ if (output_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -980,14 +1210,14 @@ void CLQLSTMLayer::run()
_recurrent_to_forget_outstage.run();
_accumulate_input_recurrent_forget.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
}
@@ -1002,7 +1232,7 @@ void CLQLSTMLayer::run()
_recurrent_to_cell_outstage.run();
_accumulate_input_recurrent_modulation.run();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
}
@@ -1010,7 +1240,7 @@ void CLQLSTMLayer::run()
_cell_gate_tanh.run();
// Input gate
- if(_has_cifg)
+ if (_has_cifg)
{
_input_gate_sub.run();
}
@@ -1022,14 +1252,14 @@ void CLQLSTMLayer::run()
_recurrent_to_input_outstage.run();
_accumulate_input_recurrent_input.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
}
@@ -1041,7 +1271,7 @@ void CLQLSTMLayer::run()
_pixelwise_mul_forget_cell.run();
_pixelwise_mul_input_cell.run();
_add_forget_cell.run();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
_cell_clip.run();
}
@@ -1052,14 +1282,14 @@ void CLQLSTMLayer::run()
_mm_recurrent_to_output.run();
_recurrent_to_output_outstage.run();
_accumulate_input_recurrent_output.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
}
@@ -1072,31 +1302,31 @@ void CLQLSTMLayer::run()
_hidden_outstage.run();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
_mm_projection.run();
_projection_outstage.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_output_to_accumulate_copy.run();
}
_accumulate_projection.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.run();
}
- if(_has_projection_clipping)
+ if (_has_projection_clipping)
{
_projection_clip.run();
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.run();
}
@@ -1108,7 +1338,7 @@ void CLQLSTMLayer::run()
void CLQLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Pre-transpose weights to be used in GEMM.
_input_to_forget_weights_transposed.allocator()->allocate();
@@ -1125,10 +1355,11 @@ void CLQLSTMLayer::prepare()
_transpose_recurrent_to_output_weights.run();
// Precompute effective biases
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.map(true);
- std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 32767);
_ones.unmap();
}
else
@@ -1136,10 +1367,12 @@ void CLQLSTMLayer::prepare()
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } };
+ ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights},
+ {ACL_DST, &_input_to_input_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
- ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } };
+ ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights},
+ {ACL_DST, &_recurrent_to_input_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
_input_to_input_weights_transposed.allocator()->allocate();
@@ -1156,30 +1389,35 @@ void CLQLSTMLayer::prepare()
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } };
+ ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights},
+ {ACL_DST, &_input_to_forget_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
- ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } };
+ ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights},
+ {ACL_DST, &_recurrent_to_forget_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
- ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } };
+ ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
- ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } };
+ ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights},
+ {ACL_DST, &_recurrent_to_cell_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
- ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } };
+ ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights},
+ {ACL_DST, &_input_to_output_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
- ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } };
+ ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights},
+ {ACL_DST, &_recurrent_to_output_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
- if(_has_projection)
+ if (_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } };
+ ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}};
CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
- if(_projection_bias != nullptr)
+ if (_projection_bias != nullptr)
{
_projection_bias_add.run();
_projection_bias->mark_as_unused();
@@ -1189,7 +1427,7 @@ void CLQLSTMLayer::prepare()
_transpose_projection_weights.run();
_projection_weights->mark_as_unused();
- if(!_projection_tensor_copy_required)
+ if (!_projection_tensor_copy_required)
{
_hidden_gate.mark_as_unused();
_projection_accumulate_res.mark_as_unused();
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index b249bdd1db..6edef29992 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClQuantize.h"
@@ -32,13 +33,12 @@ namespace arm_compute
{
struct CLQuantizationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClQuantize> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClQuantize> op{nullptr};
};
-CLQuantizationLayer::CLQuantizationLayer()
- : _impl(std::make_unique<Impl>())
+CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>())
{
}
CLQuantizationLayer::~CLQuantizationLayer() = default;
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 6f122866b2..34b78eefa7 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,24 +28,37 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(),
+ : _memory_group(std::move(memory_manager)),
+ _gemm_state_f(),
+ _add_kernel(),
+ _activation(),
+ _fully_connected_kernel(),
+ _copy(),
+ _fully_connected_out(),
+ _gemm_output(),
+ _add_output(),
_is_prepared(false)
{
}
CLRNNLayer::~CLRNNLayer() = default;
-Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
- const ITensorInfo *output, const ActivationLayerInfo &info)
+Status CLRNNLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -63,28 +76,42 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
- auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+ auto shape_info =
+ TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
return Status{};
}
-void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+void CLRNNLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
ActivationLayerInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state,
+ output, info);
}
-void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
- ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info)
+void CLRNNLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+ bias->info(), hidden_state->info(), output->info(), info));
ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -133,7 +160,7 @@ void CLRNNLayer::run()
void CLRNNLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fully_connected_kernel.prepare();
_gemm_state_f.prepare();
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 867ef7c7ac..1939d1d0ba 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -24,26 +24,36 @@
#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
#include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
namespace arm_compute
{
-Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
return Status{};
}
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 239a1c6bb2..0d2eab0c76 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -22,24 +22,35 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
#include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
using namespace arm_compute;
-Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info);
}
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index 3fbbd5f952..5c3f7f9c8c 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -27,9 +27,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLRangeKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
using namespace arm_compute;
@@ -38,7 +38,8 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRange::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
auto k = std::make_unique<CLRangeKernel>();
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index cddbf77d7c..6c6daff5ba 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -27,23 +27,25 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace
{
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
@@ -51,29 +53,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
//axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
// Only validate if not using auto_init for the output tensor
TensorShape out_shape = input->tensor_shape();
// Validate output_shape only if not using auto_init
convert_negative_axis(axis_local, input_dims);
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ for (unsigned int i = 0; i < reduction_ops; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ if (output->total_size() > 0 && keep_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
}
- if(keep_dims)
+ if (keep_dims)
{
out_shape.set(axis_local[i], 1);
}
@@ -87,8 +89,9 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
- if(requant)
+ const bool requant =
+ is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+ if (requant)
{
TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
CLDequantizationLayer::validate(input, &input_no_quant);
@@ -98,10 +101,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
}
return Status{};
}
-}
+} // namespace
CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+ : _memory_group(std::move(memory_manager)),
+ _reduction_kernels(),
+ _reduced_outs(),
+ _reshape(),
+ _dequant(),
+ _requant(),
+ _reduction_ops(),
+ _keep_dims(),
+ _do_requant(),
+ _input_no_quant(),
_output_no_quant()
{
}
@@ -111,17 +123,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
}
-void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+void CLReduceMean::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ ICLTensor *output)
{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
+ _do_requant = is_data_type_quantized(input->info()->data_type()) &&
+ input->info()->quantization_info() != output->info()->quantization_info();
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -129,7 +147,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
ICLTensor *tmp_input = input;
ICLTensor *tmp_output = output;
- if(_do_requant)
+ if (_do_requant)
{
_memory_group.manage(&_input_no_quant);
_memory_group.manage(&_output_no_quant);
@@ -148,46 +166,51 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ TensorShape out_shape =
+ i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
- if(i == _reduction_ops - 1 && keep_dims)
+ if (i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i],
+ ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(),
+ tmp_input->info()->data_type(),
+ tmp_input->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
- _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i],
+ ReductionOperation::MEAN_SUM);
}
}
// Allocate intermediate tensors
- for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
// Configure reshape layer if we want to drop the dimensions
- if(!_keep_dims)
+ if (!_keep_dims)
{
TensorShape out_shape = tmp_input->info()->tensor_shape();
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i, false);
}
auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output);
}
- if(_do_requant)
+ if (_do_requant)
{
_requant.configure(compile_context, &_output_no_quant, output);
_input_no_quant.allocator()->allocate();
@@ -195,7 +218,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
}
}
-Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status CLReduceMean::validate(const ITensorInfo *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ const ITensorInfo *output)
{
return validate_config(input, reduction_axis, keep_dims, output);
}
@@ -204,19 +230,19 @@ void CLReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_requant)
+ if (_do_requant)
{
_dequant.run();
}
- for(auto &kernel : _reduction_kernels)
+ for (auto &kernel : _reduction_kernels)
{
kernel.run();
}
- if(!_keep_dims)
+ if (!_keep_dims)
{
_reshape.run();
}
- if(_do_requant)
+ if (_do_requant)
{
_requant.run();
}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index cdc7fec51b..ba5489018e 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -27,35 +27,43 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/runtime/Utils.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false)
+ : _memory_group(std::move(memory_manager)),
+ _unreshaped_output(),
+ _reduction_kernel(),
+ _reshape(),
+ _reduction_axis(),
+ _is_reshape_required(false)
{
}
CLReductionOperation::~CLReductionOperation() = default;
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status CLReductionOperation::validate(
+ const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const bool is_reshape_required = !keep_dims;
- if(is_reshape_required && output->total_size() != 0)
+ if (is_reshape_required && output->total_size() != 0)
{
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
}
@@ -67,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
const auto input_qinfo = input->quantization_info();
const auto output_data_type = output->data_type();
- auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
- {
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+ QuantizationInfo qinfo) {
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
};
- if(is_reshape_required)
+ if (is_reshape_required)
{
auto shape_before_reshape = input_shape;
shape_before_reshape.set(axis, 1);
- initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
+ initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles,
+ input_qinfo);
output_internal = &output_before_reshape;
}
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
- if(is_reshape_required)
+ if (is_reshape_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output));
}
@@ -92,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
{
- if(!_is_reshape_required)
+ if (!_is_reshape_required)
{
return output;
}
@@ -103,12 +112,18 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor
return &_unreshaped_output;
}
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(
+ ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
}
-void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
@@ -117,11 +132,17 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
auto *output_internal = configure_intermediate_result_vector(input, output);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- const auto output_data_type = input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = input->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
_memory_group.manage(&_unreshaped_output);
}
@@ -129,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
_reduction_kernel = std::make_unique<CLReductionOperationKernel>();
_reduction_kernel->configure(compile_context, input, output_internal, axis, op);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.configure(compile_context, &_unreshaped_output, output);
_unreshaped_output.allocator()->allocate();
@@ -142,7 +163,7 @@ void CLReductionOperation::run()
CLScheduler::get().enqueue(*_reduction_kernel, false);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.run();
}
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index 15de959225..156e9b90c1 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -27,9 +27,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include <utility>
@@ -40,7 +40,10 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_LOG_PARAMS(input, output, stride);
auto k = std::make_unique<CLReorgLayerKernel>();
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index c51a3298c1..3d6349fb25 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClReshape.h"
@@ -35,17 +36,16 @@ namespace arm_compute
{
struct CLReshapeLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClReshape> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClReshape> op{nullptr};
};
-CLReshapeLayer::CLReshapeLayer()
- : _impl(std::make_unique<Impl>())
+CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>())
{
}
-CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
+CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default;
CLReshapeLayer::~CLReshapeLayer() = default;
@@ -78,4 +78,4 @@ void CLReshapeLayer::run()
_impl->op->run(pack);
}
} // namespace arm_compute
-/** [CLReshapeLayer snippet] **/ \ No newline at end of file
+ /** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 1fc93571d9..415de52e64 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLReverseKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
namespace arm_compute
{
@@ -35,7 +35,10 @@ void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTe
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
}
-void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverse::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis)
{
ARM_COMPUTE_LOG_PARAMS(input, output, axis);
auto k = std::make_unique<CLReverseKernel>();
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 5b78989bfa..abff0724e4 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClScale.h"
@@ -33,13 +34,12 @@ namespace arm_compute
{
struct CLScale::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClScale> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClScale> op{nullptr};
};
-CLScale::CLScale()
- : _impl(std::make_unique<Impl>())
+CLScale::CLScale() : _impl(std::make_unique<Impl>())
{
}
CLScale::~CLScale() = default;
@@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
+void CLScale::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ScaleKernelInfo &info)
{
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index c4ab3dc67a..b4897d9e62 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSelectKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
using namespace arm_compute;
@@ -38,7 +38,11 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor
configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
}
-void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelect::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
auto k = std::make_unique<CLSelectKernel>();
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index 7e3ac7d769..f79c6a1235 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -26,15 +26,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
@@ -47,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn
_kernel = std::move(k);
}
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
// Check start dimensions for being non-negative
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
- {
- return i < 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -66,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
struct CLSlice::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLSlice> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLSlice> op{nullptr};
};
-CLSlice::CLSlice()
- : _impl(std::make_unique<Impl>())
+CLSlice::CLSlice() : _impl(std::make_unique<Impl>())
{
}
-CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice::CLSlice(CLSlice &&) = default;
CLSlice &CLSlice::operator=(CLSlice &&) = default;
CLSlice::~CLSlice() = default;
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
return experimental::CLSlice::validate(input, output, starts, ends);
}
@@ -89,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin
configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
}
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d52352fc8d..2e70e2aa08 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
#include "src/gpu/cl/operators/ClPermute.h"
@@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax;
template <bool IS_LOG>
struct CLSoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<CLTensor> workspace_tensors{};
@@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(
+ const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
{
_impl->src = input;
_impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
- SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis };
+ SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis};
_impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
template <bool IS_LOG>
-Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
- SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis };
+ SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis};
return OperatorType::validate(*input, *output, softmax_info);
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::run()
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
{
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 3b7083400b..37f728895f 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -29,71 +29,100 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
namespace arm_compute
{
CLSpaceToBatchLayer::CLSpaceToBatchLayer()
- : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()),
- _fill(),
- _has_padding(false)
+ : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false)
{
}
CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default;
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _fill.configure(compile_context, output,
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
_space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _fill.configure(compile_context, output,
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
- _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+ output);
}
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -101,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
void CLSpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- if(_has_padding)
+ if (_has_padding)
{
//CLScheduler::get().enqueue(*_fill, true);
_fill.run();
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index 67dafff47f..22695c9ef3 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -29,14 +29,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
namespace arm_compute
{
-CLSpaceToDepthLayer::CLSpaceToDepthLayer()
- : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
+CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
{
}
@@ -47,7 +46,10 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
_space_to_depth_kernel->configure(compile_context, input, output, block_shape);
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index 0b27371e3f..6be43cc5cd 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
@@ -38,7 +39,7 @@ void CLSplit::run()
{
cl::CommandQueue q = CLScheduler::get().queue();
- for(unsigned i = 0; i < _num_outputs; ++i)
+ for (unsigned i = 0; i < _num_outputs; ++i)
{
_slice_functions[i].run();
}
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 6a335da00c..c15496fc31 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -21,8 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <complex>
-
#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -32,16 +30,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLStackLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
+
+#include <complex>
namespace arm_compute
{
CLStackLayer::CLStackLayer() // NOLINT
- : _input(),
- _stack_kernels(),
- _num_inputs(0)
+ : _input(), _stack_kernels(), _num_inputs(0)
{
}
@@ -52,7 +50,10 @@ void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, IC
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output);
}
-void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+void CLStackLayer::configure(const CLCompileContext &compile_context,
+ const std::vector<ICLTensor *> &input,
+ int axis,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input, axis, output);
_num_inputs = input.size();
@@ -61,7 +62,7 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std:
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for (unsigned int i = 0; i < _num_inputs; i++)
{
_stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>());
_stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output);
@@ -79,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
const unsigned int num_inputs = input.size();
- for(unsigned int i = 0; i < num_inputs; i++)
+ for (unsigned int i = 0; i < num_inputs; i++)
{
// All the tensors must have the same rank
ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -92,7 +93,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
void CLStackLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ for (unsigned i = 0; i < _num_inputs; i++)
{
CLScheduler::get().enqueue(*_stack_kernels[i], false);
}
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index 261bdc13d1..c1953cc415 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -25,17 +25,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto k = std::make_unique<CLStridedSliceKernel>();
@@ -43,9 +49,14 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IT
_kernel = std::move(k);
}
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
}
@@ -53,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out
struct CLStridedSlice::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- CLRuntimeContext *ctx{ nullptr };
- std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ CLRuntimeContext *ctx{nullptr};
+ std::unique_ptr<experimental::CLStridedSlice> op{nullptr};
};
-CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
CLStridedSlice::~CLStridedSlice() = default;
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -86,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC
_impl->dst = output;
_impl->op = std::make_unique<experimental::CLStridedSlice>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask,
+ end_mask, shrink_axis_mask);
}
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
void CLStridedSlice::run()
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index ef790995f9..4f86c4adfa 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLTile.h"
-#include "src/core/CL/kernels/CLTileKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
namespace arm_compute
{
@@ -34,7 +33,10 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTile::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
auto k = std::make_unique<CLTileKernel>();
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index e63c92eeb4..5a738f47ce 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClTranspose.h"
@@ -34,12 +35,11 @@ namespace arm_compute
{
struct CLTranspose::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClTranspose> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClTranspose> op{nullptr};
};
-CLTranspose::CLTranspose()
- : _impl(std::make_unique<Impl>())
+CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>())
{
}
CLTranspose::~CLTranspose() = default;
@@ -70,4 +70,4 @@ void CLTranspose::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 98d47810ab..ddd83e7824 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -40,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
}
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start,
+ int32_t &slice_end_mask,
+ const unsigned int input_num_dimensions)
{
// Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
Coordinates slice_end;
slice_start.set_num_dimensions(input_num_dimensions);
slice_end.set_num_dimensions(input_num_dimensions);
- for(size_t k = 0; k < input_num_dimensions; ++k)
+ for (size_t k = 0; k < input_num_dimensions; ++k)
{
slice_start.set(k, 0);
slice_end.set(k, -1);
@@ -56,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &
} // namespace
CLUnstack::CLUnstack() // NOLINT
- : _num_slices(0),
- _strided_slice_vector()
+ : _num_slices(0), _strided_slice_vector()
{
}
@@ -66,15 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *>
configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis);
}
-void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+void CLUnstack::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const std::vector<ICLTensor *> &output_vector,
+ int axis)
{
ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
- std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t->info();
- });
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+ [](ICLTensor *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -87,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens
Coordinates slice_start;
int32_t slice_end_mask;
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
- for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ for (unsigned int slice = 0; slice < _num_slices; ++slice)
{
// Adjusts start and end coordinates to take a 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask, (1 << axis_u));
}
}
@@ -106,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
Coordinates slice_start;
int32_t slice_end_mask;
- for(size_t k = 0; k < num_slices; ++k)
+ for (size_t k = 0; k < num_slices; ++k)
{
slice_start.set(wrap_axis(axis, input), k);
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
- ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask,
+ (1 << wrap_axis(axis, input))));
}
return Status{};
}
void CLUnstack::run()
{
- for(unsigned i = 0; i < _num_slices; ++i)
+ for (unsigned i = 0; i < _num_slices; ++i)
{
_strided_slice_vector[i].run();
}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index b416d0fcf1..645f817030 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"
@@ -35,15 +36,15 @@ namespace arm_compute
{
struct CLWinogradConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr};
ITensorPack run_pack{};
MemoryGroup memory_group{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryMa
CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default;
-void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
- bool enable_fast_math)
+void CLWinogradConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+ enable_fast_math);
}
-void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
_impl->src = input;
_impl->weights = weights;
@@ -70,20 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClWinogradConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info,
+ enable_fast_math);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, _impl->src },
- { TensorType::ACL_SRC_1, _impl->weights },
- { TensorType::ACL_SRC_2, _impl->biases },
- { TensorType::ACL_DST, _impl->dst }
- };
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src},
+ {TensorType::ACL_SRC_1, _impl->weights},
+ {TensorType::ACL_SRC_2, _impl->biases},
+ {TensorType::ACL_DST, _impl->dst}};
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
-Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
}
@@ -97,7 +114,7 @@ void CLWinogradConvolutionLayer::run()
void CLWinogradConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
@@ -107,4 +124,4 @@ void CLWinogradConvolutionLayer::prepare()
_impl->is_prepared = true;
}
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
index 18ade97885..4270165ab4 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
@@ -34,8 +35,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -44,109 +44,109 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Default configurations for Bifrost architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G71 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G52 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G76 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
const DataType data_type = params.data_type;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
- if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
+ if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
{
- return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G76:
- if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
+ if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
{
- return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G52:
- if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
+ if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
{
- return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
default:
- if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+ if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
{
- return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE;
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if((m > 1) && (n < 16))
+ if ((m > 1) && (n < 16))
{
gemm_type = CLGEMMKernelType::RESHAPED;
}
- else if(m == 1)
+ else if (m == 1)
{
gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if((k > 256) && (m > 4))
+ if ((k > 256) && (m > 4))
{
constexpr float alpha = 3.2f;
constexpr float fact0 = 1.51f;
constexpr float fact1 = 1.66f;
constexpr float ops = 12.0f;
const float scale = k > 1024 ? 1.07f : 1.0f;
- gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS;
+ gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops))
+ ? CLGEMMKernelType::RESHAPED
+ : CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
@@ -156,19 +156,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned
const auto workload = static_cast<float>((m * n) / 20.0f);
- gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type;
+ gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED
+ : gemm_type;
}
return gemm_type;
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -183,11 +185,12 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(k <= 496)
+ if (k <= 496)
{
- if(n <= 544)
+ if (n <= 544)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int
}
else
{
- if(k <= 588)
+ if (k <= 588)
{
- if(k <= 552)
+ if (k <= 552)
{
- if(m <= 148)
+ if (m <= 148)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 278)
+ if (m <= 278)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k));
- if(r_mn <= 1.5469f)
+ if (r_mn <= 1.5469f)
{
- if(r_mk <= 0.8766f)
+ if (r_mk <= 0.8766f)
{
- if(r_mk <= 0.0211f)
+ if (r_mk <= 0.0211f)
{
- if(r_mnk <= 77.5833f)
+ if (r_mnk <= 77.5833f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_nk <= 0.0832f)
+ if (r_nk <= 0.0832f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mnk <= 193.0000f)
+ if (r_mnk <= 193.0000f)
{
- if(r_mn <= 0.9948f)
+ if (r_mn <= 0.9948f)
{
- if(r_mk <= 2.5453f)
+ if (r_mk <= 2.5453f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mn <= 17.7370f)
+ if (r_mn <= 17.7370f)
{
- if(r_mnk <= 1391.2875f)
+ if (r_mnk <= 1391.2875f)
{
- if(r_mk <= 2.9724f)
+ if (r_mk <= 2.9724f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_mnk <= 470.0000f)
+ if (r_mnk <= 470.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_nk <= 0.1381f)
+ if (r_nk <= 0.1381f)
{
- if(r_mnk <= 9040.5000f)
+ if (r_mnk <= 9040.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mn <= 5.6790f)
+ if (r_mn <= 5.6790f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- if(k <= 212)
+ if (k <= 212)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_nk <= 0.4990234375f)
+ if (r_nk <= 0.4990234375f)
{
- if(k <= 1392)
+ if (k <= 1392)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 325)
+ if (m <= 325)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
}
else
{
- if(k <= 471)
+ if (k <= 471)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_mn <= 0.04475911520421505f)
+ if (r_mn <= 0.04475911520421505f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(n <= 127.0000f)
+ if (n <= 127.0000f)
{
- if(n <= 63.5000f)
+ if (n <= 63.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 3616.0000f)
+ if (m <= 3616.0000f)
{
- if(b <= 18.5000f)
+ if (b <= 18.5000f)
{
- if(m <= 2970.5000f)
+ if (m <= 2970.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 104.0000f)
+ if (k <= 104.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
else
{
- if(m <= 12.5000f)
+ if (m <= 12.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 104.0000f)
+ if (k <= 104.0000f)
{
- if(b <= 18.5000f)
+ if (b <= 18.5000f)
{
- if(m <= 490.0000f)
+ if (m <= 490.0000f)
{
- if(n <= 272.0000f)
+ if (n <= 272.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
else
{
- if(m <= 226.0000f)
+ if (m <= 226.0000f)
{
- if(n <= 140.0000f)
+ if (n <= 140.0000f)
{
- if(m <= 179.5000f)
+ if (m <= 179.5000f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -556,15 +563,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
ARM_COMPUTE_UNUSED(n);
ARM_COMPUTE_UNUSED(k);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
index ef30b28f96..673038a8db 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
@@ -35,8 +36,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Configurations for Midgard architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}};
const DataType data_type = params.data_type;
- if(gemm_configs.find(data_type) != gemm_configs.end())
+ if (gemm_configs.find(data_type) != gemm_configs.end())
{
return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
}
@@ -68,7 +67,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec
ARM_COMPUTE_ERROR("Not supported data type");
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
@@ -76,7 +76,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned
return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
@@ -84,7 +85,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned
return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant);
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
index 9e779d3752..851e23bc84 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
@@ -34,8 +35,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -44,135 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Default configurations for Valhall architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G77 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G78 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G710 and Mali-G610 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G715 and Mali-G615 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
const DataType data_type = params.data_type;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G710:
case GPUTarget::G610:
- if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
+ if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
{
- return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G715:
case GPUTarget::G615:
- if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
+ if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
{
- return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G78:
- if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
+ if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
{
- return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G77:
- if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
+ if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
{
- return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
default:
- if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+ if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
{
- return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -182,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(n <= 272.0000f)
+ if (n <= 272.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 471.0000f)
+ if (k <= 471.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 72.5000f)
+ if (m <= 72.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 90.5000f)
+ if (m <= 90.5000f)
{
return CLGEMMKernelType::RESHAPED;
}
else
{
- if(k <= 2448.0000f)
+ if (k <= 2448.0000f)
{
- if(n <= 756.0000f)
+ if (n <= 756.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -241,11 +243,12 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
@@ -253,9 +256,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return default_f32(m, n, k, b, is_rhs_constant);
}
@@ -263,7 +267,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int
unsigned int best_m0;
unsigned int best_n0;
- if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+ if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
}
@@ -273,9 +277,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return g78_f16(m, n, k, b, is_rhs_constant);
}
@@ -283,7 +288,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int
unsigned int best_m0;
unsigned int best_n0;
- if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+ if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
}
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
index 6189a324cf..c528dbcac4 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -25,6 +25,7 @@
#define SRC_CLGEMMKERNELSELECTION_H
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h"
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h"
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h"
@@ -45,7 +46,7 @@ public:
*/
static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu);
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
index b06c3b0f8e..8df57197e2 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
@@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_
bool valid = false;
CLGEMMKernelType gemm_type{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.",
+ to_string(gemm_type).c_str());
}
else
{
@@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery
{
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
- std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
+ std::unique_ptr<IClGemmKernelConfig> gemm_config =
+ ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
@@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigReshapedOnlyRHS config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
// Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter
- std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs,
- config.export_cl_image);
+ std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs,
+ !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query)
{
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
- std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
+ std::unique_ptr<IClGemmKernelConfig> gemm_config =
+ ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
@@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigReshaped config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
- std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs,
- config.transpose_rhs, config.export_cl_image);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
+ std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs,
+ config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
@@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
@@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigNative config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
// Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter
- std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
+ std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
} // namespace auto_heuristics
} // namespace cl_gemm
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
index 020237b7f4..f544715e03 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
@@ -50,8 +50,7 @@ struct CommonQuery
/** Result of querying about GEMM type ( @ref CLGEMMKernelType) */
struct GEMMTypeResult
{
- GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type)
- : valid{ valid }, gemm_type{ gemm_type }
+ GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type}
{
}
/** Test if the result is valid */
@@ -67,7 +66,7 @@ struct GEMMTypeResult
struct GEMMConfigResult
{
GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info)
- : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info }
+ : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info}
{
}
/** Test if the result is valid */
@@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query);
} // namespace cl_gemm
} // namespace arm_compute
-#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H \ No newline at end of file
+#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h
index c451bd9062..08a7ee8c18 100644
--- a/src/runtime/CL/mlgo/Common.h
+++ b/src/runtime/CL/mlgo/Common.h
@@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType;
/** GEMM Configuration for Native kernel */
struct GEMMConfigNative
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
};
/** GEMM Configuration for Reshaped Only RHS kernel */
struct GEMMConfigReshapedOnlyRHS
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
- bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */
- bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+ bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */
+ bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
/** GEMM Configuration for Reshaped kernel */
struct GEMMConfigReshaped
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
- bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
- bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */
- bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+ bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+ bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */
+ bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
} // namespace mlgo
} // namespace arm_compute
-#endif // SRC_RUNTIME_CL_MLGO_COMMON_H \ No newline at end of file
+#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp
index 1c75cdc427..f7b706902b 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.cpp
+++ b/src/runtime/CL/mlgo/HeuristicTree.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/runtime/CL/mlgo/HeuristicTree.h"
+
#include "arm_compute/core/Log.h"
#include "support/Cast.h"
@@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond)
// PRE: all features and ConditionalOps are valid
constexpr float eps = 0.0001f;
// Calculate all secondary features
- std::vector<std::pair<std::string, float>> cond_values
- {
- { "m", static_cast<float>(shape.m) },
- { "n", static_cast<float>(shape.n) },
- { "k", static_cast<float>(shape.k) },
- { "b", static_cast<float>(shape.b) },
- { "r_mn", static_cast<float>(shape.m) / shape.n },
- { "r_mk", static_cast<float>(shape.m) / shape.k },
- { "r_nk", static_cast<float>(shape.n) / shape.k },
- { "r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k) },
- { "workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0 }
- };
- auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(),
- [&cond](decltype(*cond_values.begin()) it)
- {
- return it.first == cond.feature;
- });
+ std::vector<std::pair<std::string, float>> cond_values{
+ {"m", static_cast<float>(shape.m)},
+ {"n", static_cast<float>(shape.n)},
+ {"k", static_cast<float>(shape.k)},
+ {"b", static_cast<float>(shape.b)},
+ {"r_mn", static_cast<float>(shape.m) / shape.n},
+ {"r_mk", static_cast<float>(shape.m) / shape.k},
+ {"r_nk", static_cast<float>(shape.n) / shape.k},
+ {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)},
+ {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}};
+ auto cond_value_pair_it =
+ std::find_if(cond_values.begin(), cond_values.end(),
+ [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; });
ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end());
const float cond_value = cond_value_pair_it->second;
- switch(cond.op)
+ switch (cond.op)
{
case ConditionalOp::LT:
{
@@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes;
constexpr size_t HeuristicTree::_max_query_depth;
constexpr HeuristicTree::NodeID HeuristicTree::_root;
-HeuristicTree::HeuristicTree()
- : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
+HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
{
}
HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type)
- : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{}
+ : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{}
{
}
@@ -108,16 +104,17 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const
// Root ID = 0;
auto cur_node = _tree.at(_root).get();
size_t depth = 0;
- while(cur_node->type() != NodeType::Leaf)
+ while (cur_node->type() != NodeType::Leaf)
{
- if(depth > _max_query_depth)
+ if (depth > _max_query_depth)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?",
+ _max_query_depth);
return std::make_pair(false, T{});
}
ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType");
auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
- if(evaluate(shape, br_node->condition))
+ if (evaluate(shape, br_node->condition))
{
cur_node = _tree.at(br_node->true_node).get();
}
@@ -135,12 +132,12 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const
template <typename T>
bool HeuristicTree::add_leaf(NodeID id, T val)
{
- if(_tree.size() >= _max_num_nodes)
+ if (_tree.size() >= _max_num_nodes)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
return false;
}
- if(_tree.find(id) != _tree.end())
+ if (_tree.find(id) != _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
return false;
@@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val)
bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
{
- if(_tree.size() >= _max_num_nodes)
+ if (_tree.size() >= _max_num_nodes)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
return false;
}
- const std::set<std::string> supported_features =
- {
- "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"
- };
- const auto orig_feature = cond.feature;
- std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c)
- {
- return std::tolower(c);
- });
- if(supported_features.find(cond.feature) == supported_features.end())
+ const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"};
+ const auto orig_feature = cond.feature;
+ std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(),
+ [](char c) { return std::tolower(c); });
+ if (supported_features.find(cond.feature) == supported_features.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str());
return false;
}
- if(_tree.find(id) != _tree.end())
+ if (_tree.find(id) != _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
return false;
@@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID
bool HeuristicTree::check_if_structurally_correct() const
{
std::set<NodeID> visited;
- std::deque<NodeID> to_visit{ _root };
+ std::deque<NodeID> to_visit{_root};
- while(!to_visit.empty())
+ while (!to_visit.empty())
{
auto id = to_visit.front();
to_visit.pop_front();
- if(_tree.find(id) == _tree.end())
+ if (_tree.find(id) == _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id);
return false;
}
auto not_seen_before = visited.insert(id);
- if(!not_seen_before.second)
+ if (!not_seen_before.second)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops");
return false;
}
auto cur_node = _tree.at(id).get();
- if(cur_node->type() == NodeType::Branch)
+ if (cur_node->type() == NodeType::Branch)
{
auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
to_visit.push_back(br_node->true_node);
to_visit.push_back(br_node->false_node);
}
}
- if(visited.size() != _tree.size())
+ if (visited.size() != _tree.size())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes");
return false;
@@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const
bool HeuristicTree::check()
{
- if(_tree.empty())
+ if (_tree.empty())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered");
return false;
}
- if(_tree.find(_root) == _tree.end())
+ if (_tree.find(_root) == _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root);
return false;
@@ -237,7 +229,8 @@ template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shap
/** Explicit template instantiation @relates HeuristicTree */
template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const;
/** Explicit template instantiation @relates HeuristicTree */
-template std::pair<bool, GEMMConfigReshapedOnlyRHS> HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
+template std::pair<bool, GEMMConfigReshapedOnlyRHS>
+HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
/** Explicit template instantiation @relates HeuristicTree */
template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const;
diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h
index d5c7de2215..a4f8c116b9 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.h
+++ b/src/runtime/CL/mlgo/HeuristicTree.h
@@ -25,6 +25,7 @@
#define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
#include "arm_compute/core/Types.h"
+
#include "src/runtime/CL/mlgo/Common.h"
#include <map>
@@ -84,7 +85,7 @@ public:
struct BranchNode : public Node
{
BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
- : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node }
+ : id{id}, condition{cond}, true_node{t_node}, false_node{f_node}
{
}
NodeType type() const override
@@ -100,8 +101,7 @@ public:
template <typename T>
struct LeafNode : public Node
{
- LeafNode(NodeID id, T val)
- : id{ id }, value{ val }
+ LeafNode(NodeID id, T val) : id{id}, value{val}
{
}
NodeType type() const override
@@ -177,22 +177,22 @@ public:
bool check();
private:
- static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query
- static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree
- static constexpr NodeID _root{ 0 }; // Root tree ID
+ static constexpr size_t _max_query_depth{1000}; // Maximum depth of query
+ static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree
+ static constexpr NodeID _root{0}; // Root tree ID
private:
bool check_if_structurally_correct() const;
private:
- TreeID _id; /**< Heuristic tree ID */
- HeuristicType _heuristic_type; /**< Heuristic type */
- std::string _ip_target; /**< IP target associated with the tree */
- DataType _data_type; /**< Data type associated with the tree */
- std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
+ TreeID _id; /**< Heuristic tree ID */
+ HeuristicType _heuristic_type; /**< Heuristic type */
+ std::string _ip_target; /**< IP target associated with the tree */
+ DataType _data_type; /**< Data type associated with the tree */
+ std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
};
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
index 80f3bb85e9..aed46cd80f 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
@@ -24,6 +24,7 @@
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"
#include "arm_compute/core/Log.h"
+
#include "src/runtime/CL/mlgo/MLGOParser.h"
#include "src/runtime/CL/mlgo/Utils.h"
@@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs)
}
bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs)
{
- return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs,
- rhs.export_cl_image);
+ return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) ==
+ std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
}
bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs)
{
- return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0,
- rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
+ return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs,
+ lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs,
+ rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
}
constexpr size_t MLGOHeuristics::_max_num_trees;
-MLGOHeuristics::MLGOHeuristics()
- : _indices{}, _trees{}, _tree_valid{}, _valid{ false }
+MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false}
{
}
@@ -59,71 +60,74 @@ std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) co
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str());
const auto invalid = GEMMType::RESHAPED;
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMType>(shape_query);
}
std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigNative{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigNative>(shape_query);
}
std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigReshapedOnlyRHS{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query);
}
std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigReshaped{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigReshaped>(shape_query);
}
@@ -131,14 +135,14 @@ std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(c
bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
{
bool status;
- HeuristicTree *tree{ nullptr };
+ HeuristicTree *tree{nullptr};
std::tie(status, tree) = get_heuristic_tree(id);
- if(!status)
+ if (!status)
{
return status;
}
status = tree->check();
- if(!status)
+ if (!status)
{
return status;
}
@@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
bool MLGOHeuristics::check_all() const
{
// Tree validities are already checked and cached.
- bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v)
- {
- return !v.second;
- })
- == _tree_valid.end();
- if(!all_trees_are_checked)
+ bool all_trees_are_checked =
+ std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end();
+ if (!all_trees_are_checked)
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each "
+ "tree is completed. This could also indicate there are no trees in the dotmlgo");
return false;
}
@@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const
std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id)
{
- if(_indices.find(id) == _indices.end())
+ if (_indices.find(id) == _indices.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id);
return std::make_pair(false, nullptr);
}
const auto index = _indices[id];
- if(_trees.find(index) == _trees.end())
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
return std::make_pair(false, nullptr);
@@ -186,7 +188,7 @@ std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTre
bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
{
- if(_indices.size() >= _max_num_trees)
+ if (_indices.size() >= _max_num_trees)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees);
return false;
@@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
// PRE: correctness of t is guaranteed by the tree construction process
// Ensure unique id
const auto id = t.id();
- if(_indices.find(id) != _indices.end())
+ if (_indices.find(id) != _indices.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id);
return false;
@@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
// Ensure unique index
const auto index = t.index();
- if(_trees.find(index) != _trees.end())
+ if (_trees.find(index) != _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists");
return false;
@@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename)
std::ifstream fs;
fs.exceptions(std::ifstream::badbit);
fs.open(filename, std::ios::in);
- if(!fs.is_open())
+ if (!fs.is_open())
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead",
+ filename.c_str());
return _valid = false;
}
return reload_from_stream(fs);
@@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename)
bool MLGOHeuristics::reload_from_stream(std::istream &in)
{
auto parsed = parser::parse_mlgo(in);
- if(!parsed.first)
+ if (!parsed.first)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead");
return _valid = false;
@@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in)
}
} // namespace mlgo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h
index aa21225959..6a491c5503 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.h
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.h
@@ -135,16 +135,16 @@ public:
bool check_all() const;
private:
- static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/
+ static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/
private:
// There exists a one-to-one mappipng between TreeID and Index, either can be used to identify a @ref HeuristicTree
std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */
std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */
std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */
- bool _valid; /**< Overall validity */
+ bool _valid; /**< Overall validity */
};
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp
index 625739e450..893daf2ed9 100644
--- a/src/runtime/CL/mlgo/MLGOParser.cpp
+++ b/src/runtime/CL/mlgo/MLGOParser.cpp
@@ -22,19 +22,21 @@
* SOFTWARE.
*/
#include "src/runtime/CL/mlgo/MLGOParser.h"
+
#include "arm_compute/core/Log.h"
+
#include "src/runtime/CL/mlgo/Utils.h"
#include <sstream>
#define CHECK(parser_expr, valid_var) \
(parser_expr); \
- if(!valid_var) \
+ if (!valid_var) \
return;
#define CHECK_DEFAULT(parser_expr, valid_var, default_val) \
(parser_expr); \
- if(!valid_var) \
+ if (!valid_var) \
return default_val;
#ifdef ARM_COMPUTE_LOGGING_ENABLED
@@ -53,8 +55,7 @@
valid_var = false; \
return default_val;
-#define LOG_TOKEN_POS(tokens, pos_var) \
- const auto pos_var = tokens.current_pos();
+#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos();
#else // ARM_COMPUTE_LOGGING_ENABLED
@@ -73,19 +74,12 @@ namespace
{
void ltrim(std::string &str)
{
- str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch)
- {
- return !std::isspace(ch);
- }));
+ str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); }));
}
void rtrim(std::string &str)
{
- str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch)
- {
- return !std::isspace(ch);
- }).base(),
- str.end());
+ str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end());
}
void trim(std::string &str)
@@ -109,7 +103,7 @@ enum class ComparatorType
};
TokenStream::TokenStream(std::istream &s, const std::string &delims)
- : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{}
+ : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{}
{
read();
}
@@ -125,7 +119,7 @@ Token TokenStream::take()
ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
Token t = _tokens.front();
_tokens.pop_front();
- if(_tokens.empty())
+ if (_tokens.empty())
{
read();
}
@@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i)
ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead");
// NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end
- while(_istream && _tokens.size() <= i)
+ while (_istream && _tokens.size() <= i)
{
read();
}
@@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i)
void advance(CharPosition &pos, char ch)
{
- if(ch == '\n')
+ if (ch == '\n')
{
pos.ln += 1;
pos.col = 0;
@@ -167,17 +161,16 @@ void TokenStream::read()
do
{
// Reached eof
- if(!_istream.get(ch))
+ if (!_istream.get(ch))
{
- if(!reached_end())
+ if (!reached_end())
{
_tokens.emplace_back(TokenType::End, "", _lookahead_pos);
}
return;
}
advance(_lookahead_pos, ch);
- }
- while(std::isspace(ch) || is_delim(ch));
+ } while (std::isspace(ch) || is_delim(ch));
// Read chars until we hit a delim or eof
auto orig_pos = _lookahead_pos;
auto tok = recognize_tok(ch);
@@ -190,41 +183,41 @@ void TokenStream::read()
Token TokenStream::recognize_tok(char ch)
{
- if(ch == '[')
+ if (ch == '[')
{
- return Token{ TokenType::L_List, "", _lookahead_pos };
+ return Token{TokenType::L_List, "", _lookahead_pos};
}
- else if(ch == ']')
+ else if (ch == ']')
{
- return Token{ TokenType::R_List, "", _lookahead_pos };
+ return Token{TokenType::R_List, "", _lookahead_pos};
}
- else if(ch == '.')
+ else if (ch == '.')
{
- return float_after_dp_st(std::string{ ch });
+ return float_after_dp_st(std::string{ch});
}
- else if(std::isdigit(ch))
+ else if (std::isdigit(ch))
{
- return num_st(std::string{ ch });
+ return num_st(std::string{ch});
}
else
{
- return text_st(std::string{ ch });
+ return text_st(std::string{ch});
}
}
Token TokenStream::num_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(ch == '.')
+ if (ch == '.')
{
return float_after_dp_st(value + ch);
}
- else if(!std::isdigit(ch))
+ else if (!std::isdigit(ch))
{
- if(!is_delim(ch) && !std::isspace(ch))
+ if (!is_delim(ch) && !std::isspace(ch))
{
rewind(_lookahead_pos);
_istream.unget();
@@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Int, value, _lookahead_pos };
+ return Token{TokenType::Int, value, _lookahead_pos};
}
Token TokenStream::float_after_dp_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(!std::isdigit(ch))
+ if (!std::isdigit(ch))
{
- if(!is_delim(ch) && !std::isspace(ch))
+ if (!is_delim(ch) && !std::isspace(ch))
{
rewind(_lookahead_pos);
_istream.unget();
@@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Float, value, _lookahead_pos };
+ return Token{TokenType::Float, value, _lookahead_pos};
}
Token TokenStream::text_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(is_delim(ch))
+ if (is_delim(ch))
{
break;
}
- if(ch == '[' || ch == ']')
+ if (ch == '[' || ch == ']')
{
rewind(_lookahead_pos);
_istream.unget();
@@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Text, value, _lookahead_pos };
+ return Token{TokenType::Text, value, _lookahead_pos};
}
bool TokenStream::reached_end() const
@@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::End)
+ if (tok.type != TokenType::End)
{
FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream");
}
@@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Int)
+ if (tok.type != TokenType::Int)
{
FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token");
}
@@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Int)
+ if (tok.type != TokenType::Int)
{
FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token");
}
@@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
int val = CHECK_DEFAULT(int_val(in, valid), valid, 0);
- if(val < 0)
+ if (val < 0)
{
FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token");
}
@@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Float)
+ if (tok.type != TokenType::Float)
{
FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token");
}
@@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Text || tok.value.empty())
+ if (tok.type != TokenType::Text || tok.value.empty())
{
FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token");
}
@@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid)
bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
{
auto tok = in.peek();
- if(tok.type == TokenType::Text && tok.value == c_str)
+ if (tok.type == TokenType::Text && tok.value == c_str)
{
- if(take)
+ if (take)
{
in.take();
}
@@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
void expect_text(TokenStream &in, const std::string &str, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_text(in, str))
+ if (!accept_text(in, str))
{
FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str);
}
@@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid)
bool accept_l_list(TokenStream &in)
{
auto tok = in.peek();
- if(tok.type == TokenType::L_List)
+ if (tok.type == TokenType::L_List)
{
in.take();
return true;
@@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in)
void expect_l_list(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_l_list(in))
+ if (!accept_l_list(in))
{
FAIL_WITH_MSG(valid, pos, "Expect '['");
}
@@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid)
bool accept_r_list(TokenStream &in)
{
auto tok = in.peek();
- if(tok.type == TokenType::R_List)
+ if (tok.type == TokenType::R_List)
{
in.take();
return true;
@@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in)
void expect_r_list(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_r_list(in))
+ if (!accept_r_list(in))
{
FAIL_WITH_MSG(valid, pos, "Expect ']'");
}
@@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid)
ConditionalOp conditional_op(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "<="))
+ if (accept_text(in, "<="))
{
return ConditionalOp::LE;
}
- else if(accept_text(in, ">="))
+ else if (accept_text(in, ">="))
{
return ConditionalOp::GE;
}
- else if(accept_text(in, "=="))
+ else if (accept_text(in, "=="))
{
return ConditionalOp::EQ;
}
- else if(accept_text(in, "<"))
+ else if (accept_text(in, "<"))
{
return ConditionalOp::LT;
}
- else if(accept_text(in, ">"))
+ else if (accept_text(in, ">"))
{
return ConditionalOp::GT;
}
@@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid)
{
CHECK(expect_text(in, "ip-type", valid), valid);
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "gpu"))
+ if (accept_text(in, "gpu"))
{
;
}
- else if(accept_text(in, "cpu"))
+ else if (accept_text(in, "cpu"))
{
;
}
@@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid)
DataType data_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "f16"))
+ if (accept_text(in, "f16"))
{
return DataType::F16;
}
- else if(accept_text(in, "f32"))
+ else if (accept_text(in, "f32"))
{
return DataType::F32;
}
- else if(accept_text(in, "qasymm8"))
+ else if (accept_text(in, "qasymm8"))
{
return DataType::QASYMM8;
}
@@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid)
ComparatorType comparator_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "var"))
+ if (accept_text(in, "var"))
{
return ComparatorType::Var;
}
- else if(accept_text(in, "num"))
+ else if (accept_text(in, "num"))
{
return ComparatorType::Num;
}
- else if(accept_text(in, "enum"))
+ else if (accept_text(in, "enum"))
{
return ComparatorType::Enum;
}
@@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid)
HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "gemm-type", take))
+ if (accept_text(in, "gemm-type", take))
{
return HeuristicType::GEMM_Type;
}
- else if(accept_text(in, "gemm-config-native", take))
+ else if (accept_text(in, "gemm-config-native", take))
{
return HeuristicType::GEMM_Config_Native;
}
- else if(accept_text(in, "gemm-config-reshaped-only-rhs", take))
+ else if (accept_text(in, "gemm-config-reshaped-only-rhs", take))
{
return HeuristicType::GEMM_Config_Reshaped_Only_RHS;
}
- else if(accept_text(in, "gemm-config-reshaped", take))
+ else if (accept_text(in, "gemm-config-reshaped", take))
{
return HeuristicType::GEMM_Config_Reshaped;
}
@@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val
{
LOG_TOKEN_POS(in, pos);
auto ht = CHECK(heuristic_type(in, valid, false), valid);
- if(ht != expected_ht)
+ if (ht != expected_ht)
{
FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type");
}
@@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val
GEMMType gemm_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "native"))
+ if (accept_text(in, "native"))
{
return GEMMType::NATIVE;
}
- else if(accept_text(in, "reshaped-only-rhs"))
+ else if (accept_text(in, "reshaped-only-rhs"))
{
return GEMMType::RESHAPED_ONLY_RHS;
}
- else if(accept_text(in, "reshaped"))
+ else if (accept_text(in, "reshaped"))
{
return GEMMType::RESHAPED;
}
@@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid)
const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigNative{ m0, n0, k0 };
+ return GEMMConfigNative{m0, n0, k0};
}
GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid)
@@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v
const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex };
+ return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex};
}
GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
@@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex };
+ return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex};
}
void gpu_priority(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "best-performance"))
+ if (accept_text(in, "best-performance"))
{
;
}
- else if(accept_text(in, "best-memory-usage"))
+ else if (accept_text(in, "best-memory-usage"))
{
;
}
@@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid)
void gpu_behavior(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "static"))
+ if (accept_text(in, "static"))
{
;
}
- else if(accept_text(in, "dynamic"))
+ else if (accept_text(in, "dynamic"))
{
;
}
@@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid)
void free_vars(TokenStream &in, bool &valid)
{
CHECK(expect_l_list(in, valid), valid);
- while(!accept_r_list(in))
+ while (!accept_r_list(in))
{
CHECK(text_val(in, valid), valid);
}
@@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid)
void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid)
{
CHECK(expect_text(in, "<heuristics-table>", valid), valid);
- while(!accept_text(in, "</heuristics-table>"))
+ while (!accept_text(in, "</heuristics-table>"))
{
CHECK(heuristics_table_entry(in, h, valid), valid);
}
@@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid)
const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val);
const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val);
const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val);
- if(l_t != ComparatorType::Var || r_t != ComparatorType::Num)
+ if (l_t != ComparatorType::Var || r_t != ComparatorType::Num)
{
- FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
+ FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos,
+ "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
}
- return Condition{ l_v, c_o, r_v };
+ return Condition{l_v, c_o, r_v};
}
void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
@@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
CHECK(expect_text(in, "<heuristic", valid), valid);
const auto tree_id = CHECK(uint_val(in, valid), valid);
CHECK(expect_text(in, ">", valid), valid);
- HeuristicTree *t = nullptr;
- std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
+ HeuristicTree *t = nullptr;
+ std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
const HeuristicType t_heuristic_type = std::get<0>(t->index());
- while(!accept_text(in, "</heuristic>"))
+ while (!accept_text(in, "</heuristic>"))
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "b"))
+ if (accept_text(in, "b"))
{
// Branch node
const auto id = CHECK(uint_val(in, valid), valid);
@@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
const auto f_id = CHECK(uint_val(in, valid), valid);
valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid);
}
- else if(accept_text(in, "l"))
+ else if (accept_text(in, "l"))
{
// Leaf node
const auto id = CHECK(uint_val(in, valid), valid);
@@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
// heuristic table). For now it remains as a step for validation.
LOG_TOKEN_POS(in, pos);
CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid);
- switch(t_heuristic_type)
+ switch (t_heuristic_type)
{
case HeuristicType::GEMM_Type:
{
@@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid)
MLGOHeuristics h;
CHECK_DEFAULT(header(in, valid), valid, h);
CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h);
- while(accept_text(in, "<heuristic", false))
+ while (accept_text(in, "<heuristic", false))
{
CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h);
}
@@ -809,4 +803,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in)
#undef CHECK
#undef CHECK_DEFAULT
#undef FAIL_WITH_MSG
-#undef FAIL_WITH_MSG_DEFAULT \ No newline at end of file
+#undef FAIL_WITH_MSG_DEFAULT
diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h
index 49d8b9c644..cffce8d6a1 100644
--- a/src/runtime/CL/mlgo/MLGOParser.h
+++ b/src/runtime/CL/mlgo/MLGOParser.h
@@ -98,15 +98,14 @@ struct CharPosition
return ln == other.ln && col == other.col;
}
- size_t ln{ 0 };
- size_t col{ 0 };
+ size_t ln{0};
+ size_t col{0};
};
/** Token */
struct Token
{
- Token(TokenType t, std::string v, CharPosition pos)
- : type{ t }, value{ v }, pos{ pos }
+ Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos}
{
}
@@ -196,4 +195,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in);
} // namespace parser
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp
index 81d418c28e..c7e0100b3c 100644
--- a/src/runtime/CL/mlgo/Utils.cpp
+++ b/src/runtime/CL/mlgo/Utils.cpp
@@ -43,40 +43,38 @@ inline std::string to_str(const T &val)
std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config)
{
return os << "Native:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "}";
}
std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config)
{
return os << "ReshapedOnlyRHS:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "h0: " << config.h0 << ", "
- << "interleave_rhs: " << config.interleave_rhs << ", "
- << "transpose_rhs: " << config.transpose_rhs << ", "
- << "export_cl_image: " << config.export_cl_image
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "h0: " << config.h0 << ", "
+ << "interleave_rhs: " << config.interleave_rhs << ", "
+ << "transpose_rhs: " << config.transpose_rhs << ", "
+ << "export_cl_image: " << config.export_cl_image << "}";
}
std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config)
{
return os << "Reshaped:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "v0: " << config.v0 << ", "
- << "h0: " << config.h0 << ", "
- << "interleave_lhs: " << config.interleave_lhs << ", "
- << "interleave_rhs: " << config.interleave_rhs << ", "
- << "transpose_rhs: " << config.transpose_rhs << ", "
- << "export_cl_image: " << config.export_cl_image
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "v0: " << config.v0 << ", "
+ << "h0: " << config.h0 << ", "
+ << "interleave_lhs: " << config.interleave_lhs << ", "
+ << "interleave_rhs: " << config.interleave_rhs << ", "
+ << "transpose_rhs: " << config.transpose_rhs << ", "
+ << "export_cl_image: " << config.export_cl_image << "}";
}
std::ostream &operator<<(std::ostream &os, HeuristicType ht)
{
- switch(ht)
+ switch (ht)
{
case HeuristicType::GEMM_Type:
{
@@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht)
}
std::ostream &operator<<(std::ostream &os, DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::F32:
{
@@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos)
} // namespace mlgo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h
index c634a887e9..73b537f476 100644
--- a/src/runtime/CL/mlgo/Utils.h
+++ b/src/runtime/CL/mlgo/Utils.h
@@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht);
std::ostream &operator<<(std::ostream &os, DataType dt);
std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index);
std::ostream &operator<<(std::ostream &os, const Query &query);
-std::string to_string(const GEMMConfigNative &config);
-std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
-std::string to_string(const GEMMConfigReshaped &config);
-std::string to_string(const Query &query);
+std::string to_string(const GEMMConfigNative &config);
+std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
+std::string to_string(const GEMMConfigReshaped &config);
+std::string to_string(const Query &query);
namespace parser
{
std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
@@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_UTILS_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 6f3e32491a..5e3907f1ea 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -27,20 +27,20 @@ namespace arm_compute
{
namespace cl_tuner
{
-constexpr unsigned int max_lws_supported_x{ 64u };
-constexpr unsigned int max_lws_supported_y{ 32u };
-constexpr unsigned int max_lws_supported_z{ 32u };
+constexpr unsigned int max_lws_supported_x{64u};
+constexpr unsigned int max_lws_supported_y{32u};
+constexpr unsigned int max_lws_supported_z{32u};
/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */
class CLTuningParametersList : public ICLTuningParametersList
{
protected:
/* Shape of 4-D search space */
- TensorShape search_space_shape{ 0, 0, 0, 0 };
- std::vector<unsigned int> _lws_x{ 0 };
- std::vector<unsigned int> _lws_y{ 0 };
- std::vector<unsigned int> _lws_z{ 0 };
- std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+ TensorShape search_space_shape{0, 0, 0, 0};
+ std::vector<unsigned int> _lws_x{0};
+ std::vector<unsigned int> _lws_y{0};
+ std::vector<unsigned int> _lws_z{0};
+ std::vector<int> _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units.
The value is in the range [-31,+31].
When 0, the runtime-selected wbs used is unmodified. */
@@ -116,7 +116,8 @@ private:
* @param[in] lws_max Max LWS value allowed to be tested
* @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
*/
- void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
+ void
+ initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
};
/** A minimal subset of LWS values that only have 1,2 and 4/8 */
@@ -170,9 +171,9 @@ CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDR
search_space_shape[1] = lws_y_max;
search_space_shape[2] = lws_z_max;
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -3, -2, -1, 0, 1, 2, 3 };
+ _wbsm = {-3, -2, -1, 0, 1, 2, 3};
search_space_shape[3] = _wbsm.size();
}
}
@@ -194,26 +195,31 @@ CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gw
_lws_x = {};
_lws_y = {};
_lws_z = {};
- initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
- initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_x, gws[0], lws_x_max,
+ gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_y, gws[1], lws_y_max,
+ gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
search_space_shape[0] = _lws_x.size();
search_space_shape[1] = _lws_y.size();
search_space_shape[2] = _lws_z.size();
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -2, -1, 0, 1, 2 };
+ _wbsm = {-2, -1, 0, 1, 2};
search_space_shape[3] = _wbsm.size();
}
}
-void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws,
+ unsigned int gws,
+ unsigned int lws_max,
+ bool mod_let_one)
{
lws.push_back(1);
- for(unsigned int i = 2; i <= lws_max; ++i)
+ for (unsigned int i = 2; i <= lws_max; ++i)
{
// Power of two condition
const bool is_power_of_two = (i & (i - 1)) == 0;
@@ -221,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
// Condition for the module accordingly with the mod_let_one flag
const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
- if(mod_cond || is_power_of_two)
+ if (mod_cond || is_power_of_two)
{
lws.push_back(i);
}
@@ -246,9 +252,9 @@ CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws,
search_space_shape[1] = _lws_y.size();
search_space_shape[2] = _lws_z.size();
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -1, 0, 1 };
+ _wbsm = {-1, 0, 1};
search_space_shape[3] = _wbsm.size();
}
}
@@ -257,7 +263,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
{
lws.push_back(1);
- for(unsigned int i = 2; i <= lws_max; i *= 4)
+ for (unsigned int i = 2; i <= lws_max; i *= 4)
{
lws.push_back(i);
}
@@ -265,7 +271,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
{
- switch(tuning_info.tuner_mode)
+ switch (tuning_info.tuner_mode)
{
case CLTunerMode::EXHAUSTIVE:
return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);