diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/runtime/CL | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration
file (not part of this delivery). Version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
And the following directories
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/runtime/CL')
117 files changed, 4282 insertions, 2612 deletions
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp index e06ef3d37d..b4545b93bf 100644 --- a/src/runtime/CL/CLBufferAllocator.cpp +++ b/src/runtime/CL/CLBufferAllocator.cpp @@ -35,7 +35,8 @@ namespace arm_compute void *CLBufferAllocator::allocate(size_t size, size_t alignment) { ARM_COMPUTE_UNUSED(alignment); - cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) }; + cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, + nullptr, nullptr)}; return static_cast<void *>(buf); } diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp index 7168259fcd..d680dc08bb 100644 --- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp +++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp @@ -27,8 +27,7 @@ namespace arm_compute { -CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() - : _heuristics(std::make_unique<mlgo::MLGOHeuristics>()) +CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>()) { } CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default; diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp index 5b4bbbcde0..eb28ecbf8d 100644 --- a/src/runtime/CL/CLHelpers.cpp +++ b/src/runtime/CL/CLHelpers.cpp @@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void * @return A pointer to the context properties which can be used to create an opencl context */ -void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop) +void initialise_context_properties(const cl::Platform &platform, + const cl::Device &device, + std::array<cl_context_properties, 7> &prop) { ARM_COMPUTE_UNUSED(device); #if defined(ARM_COMPUTE_ASSERTS_ENABLED) // Query devices in the context for cl_arm_printf support - 
if(arm_compute::device_supports_extension(device, "cl_arm_printf")) + if (arm_compute::device_supports_extension(device, "cl_arm_printf")) { // Create a cl_context with a printf_callback and user specified buffer size. - std::array<cl_context_properties, 7> properties_printf = - { + std::array<cl_context_properties, 7> properties_printf = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()), // Enable a printf callback function for this context. CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback), // Request a minimum printf buffer size of 4MB for devices in the // context that support this extension. - CL_PRINTF_BUFFERSIZE_ARM, 0x1000, - 0 - }; + CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0}; prop = properties_printf; } else #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED) { - std::array<cl_context_properties, 3> properties = - { - CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()), - 0 - }; + std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM, + reinterpret_cast<cl_context_properties>(platform()), 0}; std::copy(properties.begin(), properties.end(), prop.begin()); }; } @@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) cl::Platform::get(&platforms); ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform"); - cl::Platform selected_platform{ nullptr }; + cl::Platform selected_platform{nullptr}; // If the user has selected the Native platform, return the first available. 
- switch(cl_backend_type) + switch (cl_backend_type) { case CLBackendType::Native: selected_platform = platforms[0]; break; case CLBackendType::Clvk: - for(auto p : platforms) + for (auto p : platforms) { std::string res = p.getInfo<CL_PLATFORM_NAME>(); - if(res.find("clvk") != std::string::npos) + if (res.find("clvk") != std::string::npos) { selected_platform = p; break; @@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) ARM_COMPUTE_ERROR("Unsupported backend type"); } - if(!selected_platform()) + if (!selected_platform()) { ARM_COMPUTE_ERROR("No valid platform found"); } @@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type) return selected_platform; } -std::tuple<cl::Context, cl::Device, cl_int> -create_opencl_context_and_device(CLBackendType cl_backend_type) +std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type) { ARM_COMPUTE_ERROR_ON(!opencl_is_available()); cl::Platform p = select_preferable_platform(cl_backend_type); @@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) std::vector<cl::Device> platform_devices; p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices); ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device"); - device = platform_devices[0]; - cl_int err = CL_SUCCESS; - std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 }; + device = platform_devices[0]; + cl_int err = CL_SUCCESS; + std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0}; initialise_context_properties(p, device, properties); cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context"); @@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type) void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool 
flush) { ARM_COMPUTE_ERROR_ON_NULLPTR(kernel); - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr); ctx->gpu_scheduler()->enqueue(*kernel, flush); diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp index a1743c56e6..c6ee6fde83 100644 --- a/src/runtime/CL/CLMemory.cpp +++ b/src/runtime/CL/CLMemory.cpp @@ -24,24 +24,22 @@ #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/core/Error.h" + #include "support/Cast.h" namespace arm_compute { -CLMemory::CLMemory() - : _region(nullptr), _region_owned(nullptr) +CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr) { } -CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) +CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -CLMemory::CLMemory(ICLMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } @@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region) _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region)); _region = _region_owned.get(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 00f91a0ffb..835958b816 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -29,10 +29,7 @@ namespace arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) - : IMemoryRegion(size), - _ctx(CLScheduler::get().context()), - _mapping(nullptr), - _mem() + : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem() { } @@ -57,17 +54,15 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset, return nullptr; } -CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) - : ICLMemoryRegion(size) +CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size) { - if(_size != 0) + if (_size != 0) { _mem = cl::Buffer(CLScheduler::get().context(), flags, _size); } } -CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) - : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>()) +CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>()) { _mem = buffer; } @@ -102,10 +97,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q) ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment) : ICLMemoryRegion(size), _ptr(nullptr) { - if(size != 0) + if (size != 0) { _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment); - if(_ptr != nullptr) + if (_ptr != nullptr) { _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr); } @@ -114,7 +109,7 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a ICLSVMMemoryRegion::~ICLSVMMemoryRegion() { - if(_ptr != nullptr) + if (_ptr != nullptr) { try { @@ -125,7 +120,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion() _mem = cl::Buffer(); clSVMFree(_ctx.get(), _ptr); } 
- catch(...) + catch (...) { } } @@ -144,7 +139,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(_ptr == nullptr); - clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr); + clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, + nullptr); _mapping = _ptr; return _mapping; } @@ -163,7 +159,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking) { - if(blocking) + if (blocking) { clFinish(q.get()); } diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp index 075a544077..89d4520038 100644 --- a/src/runtime/CL/CLOperator.cpp +++ b/src/runtime/CL/CLOperator.cpp @@ -30,14 +30,13 @@ namespace arm_compute { namespace experimental { -ICLOperator::ICLOperator(IRuntimeContext *ctx) - : _kernel(), _ctx(ctx), _workspace() +ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } void ICLOperator::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp index 5083b4b0c5..b426b8c304 100644 --- a/src/runtime/CL/CLRuntimeContext.cpp +++ b/src/runtime/CL/CLRuntimeContext.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -29,7 +30,10 @@ namespace arm_compute { CLRuntimeContext::CLRuntimeContext() - : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type() + : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), + _gpu_scheduler(_gpu_owned_scheduler.get()), + _symbols(), + _backend_type() { _symbols.load_default(); auto ctx_dev_err = create_opencl_context_and_device(_backend_type); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index b7a4dff45d..f0a42f55fd 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event() void CLScheduler::tune_kernel_static(ICLKernel &kernel) { - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { _cl_tuner->tune_kernel_static(kernel); } @@ -95,8 +96,16 @@ bool CLScheduler::is_initialised() const std::once_flag CLScheduler::_initialize_symbols; CLScheduler::CLScheduler() - : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(true), - _job_chaining_size(1), _job_chaining_count(0) + : _context(), + _queue(), + _target(GPUTarget::MIDGARD), + _is_initialised(false), + _cl_tuner(nullptr), + _gemm_heuristics(nullptr), + _backend_type(CLBackendType::Native), + _job_chaining_enabled(true), + _job_chaining_size(1), + _job_chaining_count(0) { } @@ -107,9 +116,12 @@ CLScheduler &CLScheduler::get() return scheduler; } -void CLScheduler::default_init_with_context(cl::Device &device, cl::Context 
&ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h) +void CLScheduler::default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h) { - if(!_is_initialised) + if (!_is_initialised) { const std::string cl_kernels_folder("./cl_kernels/"); cl::CommandQueue queue = cl::CommandQueue(ctx, device); @@ -121,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) { - if(!_is_initialised) + if (!_is_initialised) { cl::Context ctx; cl::Device dev; @@ -151,7 +163,12 @@ void CLScheduler::set_context(cl::Context context) CLKernelLibrary::get().set_context(_context); } -void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +void CLScheduler::init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h, + CLBackendType cl_backend_type) { set_context(std::move(context)); _queue = std::move(queue); @@ -164,21 +181,21 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush) { - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ + ARM_COMPUTE_ERROR_ON_MSG( + !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); const bool inject_memory = !tensors.empty(); // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { inject_memory ? 
_cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel); } // Run kernel inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); - if(_job_chaining_enabled) + if (_job_chaining_enabled) { ++_job_chaining_count; } @@ -188,9 +205,9 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f void CLScheduler::flush_queue(bool flush) { - if(_job_chaining_enabled) + if (_job_chaining_enabled) { - if(_job_chaining_count >= _job_chaining_size) + if (_job_chaining_count >= _job_chaining_size) { _job_chaining_count = 0; /* @@ -199,14 +216,14 @@ void CLScheduler::flush_queue(bool flush) the CPU activity for job-scheduling. For eg. job-chain size goes from 1, 2, 4, 8 and 16 */ - if(_job_chaining_size < 16) + if (_job_chaining_size < 16) { _job_chaining_size <<= 1; } _queue.flush(); } } - else if(flush) + else if (flush) { _queue.flush(); } diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp index 14936ae23c..ace820bbb7 100644 --- a/src/runtime/CL/CLSubTensor.cpp +++ b/src/runtime/CL/CLSubTensor.cpp @@ -29,12 +29,14 @@ using namespace arm_compute; -CLSubTensor::CLSubTensor() - : _parent(nullptr), _info() +CLSubTensor::CLSubTensor() : _parent(nullptr), _info() { } -CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent) +CLSubTensor::CLSubTensor(ICLTensor *parent, + const TensorShape &tensor_shape, + const Coordinates &coords, + bool extend_parent) : _parent(nullptr), _info() { ARM_COMPUTE_ERROR_ON(parent == nullptr); @@ -81,7 +83,7 @@ void CLSubTensor::unmap() uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking) { ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); - if(_parent->buffer() == nullptr) + if (_parent->buffer() == nullptr) { _parent->map(q, blocking); } diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index 
f85b8ae777..e6457218c7 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr; std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment) { // Try fine-grain SVM - std::unique_ptr<ICLMemoryRegion> region = std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, - size, - alignment); + std::unique_ptr<ICLMemoryRegion> region = + std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment); // Try coarse-grain SVM in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment); } // Try legacy buffer memory in case of failure - if(region != nullptr && region->ptr() == nullptr) + if (region != nullptr && region->ptr() == nullptr) { region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); } @@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset) * @param[in] qinfo Quantization info * @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes */ -void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size) +void populate_quantization_info(CLFloatArray &scale, + CLInt32Array &offset, + const QuantizationInfo &qinfo, + size_t pad_size) { clear_quantization_arrays(scale, offset); @@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type); scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * 
element_size, qinfo.scale().data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, + qinfo.scale().data()); - if(!qinfo.offset().empty()) + if (!qinfo.offset().empty()) { // Create offset array - const std::vector<int32_t> &qoffset = qinfo.offset(); - const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type); - offset = CLInt32Array(num_elements + pad_size); + const std::vector<int32_t> &qoffset = qinfo.offset(); + const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type); + offset = CLInt32Array(num_elements + pad_size); offset.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(static_global_cl_allocator != nullptr) + if (static_global_cl_allocator != nullptr) { _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } @@ -146,7 +150,7 @@ void CLTensorAllocator::allocate() } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; 
populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator) uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 445638f01f..0d62fe3afe 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + #include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -37,19 +38,23 @@ namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } struct CLTuner::IKernelData { - virtual ~IKernelData() = default; + virtual ~IKernelData() = default; virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; }; struct DefaultKernelData : public CLTuner::IKernelData { - DefaultKernelData(ITensorPack &tensors) - : _tensors{ tensors } + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} { } ~DefaultKernelData() override = default; @@ -100,16 +105,17 @@ void 
CLTuner::tune_kernel_dynamic(ICLKernel &kernel) void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { auto p = _tuning_params_table.find(config_id); - if(p == _tuning_params_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); @@ -119,7 +125,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); } @@ -129,7 +135,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { kernel.set_wbsm_hint(p->second.get_wbsm()); } @@ -138,7 +144,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) } void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) { - DefaultKernelData data{ tensors }; + DefaultKernelData data{tensors}; do_tune_kernel_dynamic(kernel, 
&data); } @@ -154,7 +160,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -165,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -176,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. 
return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -209,9 +217,10 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() /// Please see COMPMID-5934 cl::NDRange gws = kernel.get_cached_gws(); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", - kernel.config_id().c_str(), to_string(gws).c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -224,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat // Construct the list of tuning parameters values to be tested based on the tuner mode. 
auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); - for(size_t i = 0; i < tuning_list->size(); ++i) + for (size_t i = 0; i < tuning_list->size(); ++i) { CLTuningParams tuning_test = (*tuning_list)[i]; // Setting the lws @@ -234,19 +243,18 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } kernel.set_lws_hint(lws_test); - if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) { cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, - "[CLTuner] Trying LWS: %s, WBSM: %d", + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel @@ -260,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; opt_tuning_params.set_lws(tuning_test.get_lws()); - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); } @@ -292,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; bool header_line = true; - while(!std::getline(fs, line).fail()) + while (!std::getline(fs, line).fail()) { - if(header_line) + if (header_line) { header_line = false; size_t 
pos_lws = line.find("lws"); size_t pos_wbsm = line.find("wbsm"); _tuning_info.tune_wbsm = false; - if(pos_lws != std::string::npos || pos_wbsm != std::string::npos) + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { // The file has in the first line the parameters it has been tuned on - if(pos_wbsm != std::string::npos) + if (pos_wbsm != std::string::npos) { _tuning_info.tune_wbsm = true; } // Once the line with the tuning parameter is read we can // read the next one to start collecting the values - if(std::getline(fs, line).fail()) + if (std::getline(fs, line).fail()) { break; } @@ -324,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename) CLTuningParams tuning_params; size_t pos = line.find(";"); - if(pos == std::string::npos) + if (pos == std::string::npos) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } std::string kernel_id = line.substr(0, pos); line.erase(0, pos + 1); - if(!tuning_params.from_string(_tuning_info, line)) + if (!tuning_params.from_string(_tuning_info, line)) { ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } @@ -341,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename) bool CLTuner::save_to_file(const std::string &filename) const { - if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) { return false; } @@ -350,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const fs.open(filename, std::ios::out); std::string header_string = ""; header_string += "lws"; - if(_tuning_info.tune_wbsm) + if (_tuning_info.tune_wbsm) { - if(!header_string.empty()) + if (!header_string.empty()) { header_string += " "; } header_string += "wbsm"; } fs << header_string << std::endl; - for(auto const &kernel_data : _tuning_params_table) + for (auto const &kernel_data : _tuning_params_table) { CLTuningParams 
tun_pams(kernel_data.second); fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index 4530537789..bc782c3a2c 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(std::make_unique<CLFillBorderKernel>()), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx) { } diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index da3d4850bf..294396c28a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -35,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t)); - if(name_len == 0 || binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -60,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries 
binaries{binary}; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Program program(context, devices, binaries); program.build(); @@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index f324b1a68c..c035644e4a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClActivation.h" @@ -35,18 +36,17 @@ namespace arm_compute { struct CLActivationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr<opencl::ClActivation> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<opencl::ClActivation> op{nullptr}; }; -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : _impl(std::make_unique<Impl>()) +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { _impl->ctx = ctx; } -CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; 
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; CLActivationLayer::~CLActivationLayer() = default; @@ -55,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -66,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return opencl::ClActivation::validate(input, output, act_info); } diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index b30d739025..f9bbd31e8a 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -27,31 +27,39 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - 
namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); DataType output_data_type = DataType::S32; @@ -59,17 +67,18 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen const auto input_num_channles = input->num_channels(); const auto input_qinfo = 
input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; @@ -85,20 +94,36 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _reduction_axis = axis; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? 
DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - - TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() }; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; not_reshaped_output_shape.set(axis, 1); - auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>(); _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index e8affc0853..0c371c4171 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -30,9 +30,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" namespace arm_compute { @@ -43,24 
+42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer() CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, 
act_info); } @@ -69,4 +84,4 @@ void CLBatchNormalizationLayer::run() { CLScheduler::get().enqueue(*_norm_kernel, true); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index d7a409128d..a3798daf61 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -30,14 +30,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" namespace arm_compute { -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>()) +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>()) { } @@ -49,29 +47,43 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, 
const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index a4712ed3f1..7bfd0e3677 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -36,11 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const 
ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<CLBitwiseKernel>(); k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 5964b92447..9763915c02 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -43,4 +42,4 @@ void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLT k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index a07bf17bb2..dd3171b982 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -36,11 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<CLBitwiseKernel>(); k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index f65e2e406c..5bee4b37ec 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "src/core/CL/kernels/CLBitwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> @@ -36,7 +35,10 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<CLBitwiseKernel>(); diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 48583bfaf3..76e626fd75 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,18 +23,24 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), 
boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); @@ -44,7 +50,10 @@ void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 10f7cc2065..42ec8f7ee0 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> @@ -37,16 +37,15 @@ namespace arm_compute { struct CLCast::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCast> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; }; -CLCast::CLCast() - : _impl(std::make_unique<Impl>()) +CLCast::CLCast() : _impl(std::make_unique<Impl>()) { } 
-CLCast::CLCast(CLCast &&) = default; +CLCast::CLCast(CLCast &&) = default; CLCast &CLCast::operator=(CLCast &&) = default; CLCast::~CLCast() = default; @@ -55,7 +54,10 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, policy); @@ -74,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void CLCast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 021f28f238..1ee4789816 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + 
const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); auto k = std::make_unique<CLChannelShuffleLayerKernel>(); diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 192a266f0f..2f54371e88 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLComparisonKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { @@ -37,25 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparison::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_LOG_PARAMS(input2, input2, output, operation); auto k = std::make_unique<CLComparisonKernel>(); k->configure(compile_context, input1, input2, output, operation); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } -Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { return CLComparisonKernel::validate(input1, input2, output, operation); } @@ -67,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC } template <ComparisonOperation COP> -void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { auto k = std::make_unique<CLComparisonKernel>(); k->configure(compile_context, input1, input2, output, COP); _kernel = std::move(k); - if(output->info()->dimension(0) > 1) + if (output->info()->dimension(0) > 1) { ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; - if(broadcasted_info->info()->dimension(0) == 1) + if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), + BorderMode::REPLICATE); } } } template <ComparisonOperation COP> -Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status +CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { return CLComparisonKernel::validate(input1, input2, output, COP); } diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 0a8884f4e3..9df1c34593 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -24,24 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConcatenate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConcatenate.h" namespace arm_compute { struct CLConcatenateLayer::Impl { std::vector<const ICLTensor *> srcs{}; - ICLTensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr<opencl::ClConcatenate> op{ nullptr }; + ICLTensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr<opencl::ClConcatenate> op{nullptr}; }; -CLConcatenateLayer::CLConcatenateLayer() - : _impl(std::make_unique<Impl>()) +CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>()) { } @@ -56,7 +55,10 @@ void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, 
output, axis); } -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector<const ICLTensor *> &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); @@ -68,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op = std::make_unique<opencl::ClConcatenate>(); std::vector<ITensorInfo *> inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -76,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std: _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { return opencl::ClConcatenate::validate(inputs_vector, output, axis); } @@ -84,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu void CLConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp index 729b973b6a..9d1b368f72 100644 --- a/src/runtime/CL/functions/CLConv3D.cpp +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLConv3D.h" 
#include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/operators/ClDirectConv3d.h" namespace arm_compute @@ -32,29 +33,38 @@ using namespace arm_compute::experimental; struct CLConv3D::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClDirectConv3d> op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDirectConv3d> op{nullptr}; }; -CLConv3D::CLConv3D() - : _impl(std::make_unique<Impl>()) +CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>()) { } CLConv3D::~CLConv3D() = default; -void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); } -void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info) +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), dst->info(), conv3d_info)); _impl->src = src; _impl->weights = weights; @@ -62,10 +72,15 @@ void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTenso _impl->dst = dst; _impl->op = std::make_unique<opencl::ClDirectConv3d>(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); } -Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); } diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index b3efe5c8a0..2298f2a669 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -27,33 +27,37 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" namespace arm_compute { struct CLConvertFullyConnectedWeights::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + 
std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr}; }; -CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() - : _impl(std::make_unique<Impl>()) +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) { } CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); @@ -63,8 +67,10 @@ void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_c _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } diff --git 
a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index f3c05adb47..7767b45a01 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -28,11 +28,11 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConv2d.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -43,41 +43,59 @@ struct CLConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr<IMemoryManager> memory_manager{}; - std::unique_ptr<opencl::IClOperator> op{ nullptr }; + std::unique_ptr<opencl::IClOperator> op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData<CLTensor> workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr<IFunction> func{ nullptr }; + std::unique_ptr<IFunction> func{nullptr}; }; -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _impl(std::make_unique<Impl>()) +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { _impl->memory_manager = std::move(memory_manager); } CLConvolutionLayer::~CLConvolutionLayer() = default; -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, 
+ const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, - weights_info, CLScheduler::get().target())) + switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, + weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: @@ -85,7 +103,8 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT case ConvolutionMethod::GEMM: { auto f = std::make_unique<opencl::ClConv2d>(); - f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + f->configure(compile_context, input->info(), weights->info(), + ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); _impl->op = std::move(f); break; } @@ -101,40 +120,52 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CLConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) + switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::DIRECT: case ConvolutionMethod::INDIRECT: case ConvolutionMethod::GEMM: { - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info)); break; } case ConvolutionMethod::FFT: { // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, + act_info, enable_fast_math)); break; } default: @@ -145,8 +176,15 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) +ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const ActivationLayerInfo 
&act_info, + const GPUTarget gpu_target, + const Size2D &dilation, + bool enable_fast_math) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); @@ -158,7 +196,7 @@ void CLConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -170,7 +208,7 @@ void CLConvolutionLayer::run() void CLConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 56400b67a0..a4f2b0634f 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCopy.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCopy.h" #include <utility> @@ -38,16 +38,15 @@ namespace arm_compute { struct CLCopy::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCopy> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCopy> op{nullptr}; }; -CLCopy::CLCopy() - : _impl(std::make_unique<Impl>()) +CLCopy::CLCopy() : _impl(std::make_unique<Impl>()) { } -CLCopy::CLCopy(CLCopy &&) = default; +CLCopy::CLCopy(CLCopy &&) = default; CLCopy &CLCopy::operator=(CLCopy &&) = default; CLCopy::~CLCopy() = default; diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp index 35ea17cfc2..fc29c43827 100644 --- a/src/runtime/CL/functions/CLCrop.cpp +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/ICLTensor.h" 
#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCrop.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCrop.h" #include <utility> @@ -38,27 +38,38 @@ namespace arm_compute { struct CLCrop::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCrop> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCrop> op{nullptr}; }; -CLCrop::CLCrop() - : _impl(std::make_unique<Impl>()) +CLCrop::CLCrop() : _impl(std::make_unique<Impl>()) { } -CLCrop::CLCrop(CLCrop &&) = default; +CLCrop::CLCrop(CLCrop &&) = default; CLCrop &CLCrop::operator=(CLCrop &&) = default; CLCrop::~CLCrop() = default; -void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window); + configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, + dst_window); } -void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, 
extrapolation_value, dst_window); @@ -67,10 +78,17 @@ void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor _impl->dst = dst; _impl->op = std::make_unique<opencl::ClCrop>(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); } -Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); } diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index d8fc38d99e..821412b149 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,19 +25,26 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/common/utils/Log.h" - #include <cstddef> namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t 
&batch_index) { batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -50,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, + static_cast<uint32_t>(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } CLCropResize::~CLCropResize() = default; -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + 
const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -83,20 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen return Status{}; } -void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { - configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value); + configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, + extrapolation_value); } -void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void CLCropResize::configure(const CLCompileContext 
&compile_context, + const ICLTensor *input, + ICLTensor *boxes, + ICLTensor *box_ind, + ICLTensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind); - ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); - TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); + TensorShape output_shape = + TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]); auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -122,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // kernels used for cropping and scaling. 
_boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { auto crop_tensor = std::make_unique<CLTensor>(); TensorInfo crop_result_info(1, DataType::F32); @@ -143,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = std::make_unique<CLScale>(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT }); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); @@ -159,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> rows_out_of_bounds{ 0 }; + std::array<int32_t, 2> rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array<int32_t, 2> cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? 
std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? 
std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -203,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - - int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. 
- if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. - int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? 
start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; auto kernel = std::make_unique<CLCrop>(); - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. 
- if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); auto kernel = std::make_unique<CLFill>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); //_internal_functions.emplace_back(std::move(kernel)); _internal_functions.push_back(std::move(kernel)); } @@ -277,18 +357,18 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_functions.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { kernel->run(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 4421a18f2a..e988ab0ac4 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -25,16 +25,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include 
"src/core/CL/ICLKernel.h" #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/operators/ClTransposedConvolution.h" -#include "src/common/utils/Log.h" - #include <cmath> #include <memory> #include <tuple> @@ -44,11 +44,11 @@ using namespace arm_compute::misc::shape_calculator; struct CLDeconvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::IClOperator> op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; }; CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; @@ -58,24 +58,35 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - 
switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { auto op = std::make_unique<opencl::ClTransposedConvolution>(); - op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, + output->info(), deconv_info); _impl->src = input; _impl->weights = weights; @@ -105,22 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate transposed convolution operator - ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); break; } case DeconvolutionMethod::UPSCALE_CONV2D: { // Validate direct convolution layer - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -137,12 +154,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { return DeconvolutionMethod::UPSCALE_CONV2D; } @@ -154,11 +175,12 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second) { // We observe better performance for FP32 types only when ofm <= 16. // A better heuristic is required for selecting the method for FP16 data types. 
- if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) + if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16))) { return DeconvolutionMethod::DIRECT; } @@ -175,7 +197,7 @@ void CLDeconvolutionLayer::run() { prepare(); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { // Optimized Operator will be used ITensorPack pack; @@ -195,7 +217,7 @@ void CLDeconvolutionLayer::run() void CLDeconvolutionLayer::prepare() { - if(_impl->op == nullptr) + if (_impl->op == nullptr) { _function->prepare(); } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index 0b428f5b17..b92bf903a6 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), - _fill(), - _output(nullptr) + : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr) { } CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -52,13 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor 
*input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); _upsample->configure(compile_context, input, _output, info); } diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index cac3f51013..6d2fea974e 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClCast.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> @@ -37,16 +37,15 @@ namespace arm_compute { struct CLDepthConvertLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClCast> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; }; -CLDepthConvertLayer::CLDepthConvertLayer() - : _impl(std::make_unique<Impl>()) +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>()) { } -CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = 
default; +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; CLDepthConvertLayer::~CLDepthConvertLayer() = default; @@ -55,7 +54,11 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) { ARM_COMPUTE_UNUSED(shift); ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); @@ -70,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return opencl::ClCast::validate(input, output, policy); @@ -78,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void CLDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 98531e7cac..9477c7f81d 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,9 +23,8 @@ */ #include 
"arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include <utility> @@ -36,7 +35,10 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); auto k = std::make_unique<CLDepthToSpaceLayerKernel>(); diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index dcb982fa56..873601bb11 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::misc; @@ -63,25 +63,33 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemory CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default; -void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int 
depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, + act_info, dilation); } -void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, - ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output != nullptr ? output->info() : input->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); + ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, + output != nullptr ? 
output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); _is_quantized = is_data_type_quantized(input->info()->data_type()); @@ -96,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = output; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -119,10 +127,12 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont CLTensor *output_multipliers_to_use = nullptr; CLTensor *output_shifts_to_use = nullptr; - if(_is_quantized) + if (_is_quantized) { - const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); - const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; + const size_t idx_c = + get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = + (is_data_type_quantized_per_channel(weights->info()->data_type())) ? 
weights->info()->dimension(idx_c) : 1; _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); @@ -132,16 +142,18 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont } // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; _dwc_native_kernel->set_target(gpu_target); _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use); + dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, + output_shifts_to_use); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); @@ -151,22 +163,27 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont _permuted_output.allocator()->allocate(); } - if(_is_quantized) + if (_is_quantized) { _output_multipliers.allocator()->allocate(); _output_shifts.allocator()->allocate(); } } -Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo 
*biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) + unsigned int depth_multiplier, + ActivationLayerInfo act_info, + const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); const bool in_place = input == output || output == nullptr; - if(in_place) + if (in_place) { output = input; } @@ -174,21 +191,23 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > + input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > + input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); const GPUTarget gpu_target = CLScheduler::get().target(); - const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation}; const bool needs_permute = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized(input->data_type()); TensorInfo 
output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) + if (is_quantized) { - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); @@ -201,40 +220,57 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe } } - if(needs_permute) + if (needs_permute) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout"); TensorShape permuted_input_shape = input->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation }; - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); + const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation}; + TensorShape permuted_output_shape = + shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_input = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + 
.set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_weights = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC); + const TensorInfo permuted_output = output->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NHWC); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, + &output_multipliers_shifts_info, &output_multipliers_shifts_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); } else { // Get the depthwise convolution compute parameters - auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, - &output_multipliers_shifts_info)); + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = + t->configure(input, weights, conv_info, dilation, depth_multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate( + input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, + &output_multipliers_shifts_info)); } return Status{}; } @@ -245,12 +281,12 @@ void CLDepthwiseConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_needs_permute) + if (_needs_permute) { _permute_input_to_nhwc.run(); } CLScheduler::get().enqueue(*_dwc_native_kernel); - if(_needs_permute) + if (_needs_permute) { _permute_output_to_nchw.run(); } @@ -258,22 +294,21 @@ void CLDepthwiseConvolutionLayer::run() void CLDepthwiseConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_is_quantized) + if (_is_quantized) { _output_multipliers.map(); _output_shifts.map(); - quantization::compute_quantized_multipliers_and_shifts(_input->info(), - _original_weights->info(), - _output != nullptr ? _output->info() : _input->info(), - reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))), - reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0)))); + quantization::compute_quantized_multipliers_and_shifts( + _input->info(), _original_weights->info(), _output != nullptr ? 
_output->info() : _input->info(), + reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); _output_shifts.unmap(); } - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 64c6b5d91c..20162a03db 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -26,22 +26,21 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClDequantize.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClDequantize.h" namespace arm_compute { struct CLDequantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClDequantize> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDequantize> op{nullptr}; }; -CLDequantizationLayer::CLDequantizationLayer() - : _impl(std::make_unique<Impl>()) +CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>()) { } CLDequantizationLayer::~CLDequantizationLayer() = default; @@ -51,7 +50,9 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) configure(CLKernelLibrary::get().get_compile_context(), input, output); } -void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, output); _impl->src = input; diff --git 
a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 752e0e4a60..d6dae0d732 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -28,37 +28,46 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClActivation.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" namespace arm_compute { struct CLDirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDirectConv2d> op{nullptr}; }; -CLDirectConvolutionLayer::CLDirectConvolutionLayer() - : _impl(std::make_unique<Impl>()) +CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>()) { } -CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; +CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default; CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default; -void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + 
const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -69,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context _impl->dst = output; _impl->op = std::make_unique<opencl::ClDirectConv2d>(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -87,4 +101,4 @@ void CLDirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 88c3c6193c..3717f30ae1 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - #include <memory> #include <tuple> @@ -55,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa { } -Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +Status CLDirectDeconvolutionLayer::validate(const 
ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -70,20 +75,22 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - if(input->data_type() != weights->data_type()) + if (input->data_type() != weights->data_type()) { - ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || + !is_data_type_quantized_asymmetric(input->data_type())); } - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -102,24 
+109,39 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen unsigned int deconv_pad_y = 0; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } -void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo &weights_info) +void CLDirectDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); } -void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, - const WeightsInfo 
&weights_info) +void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info); @@ -141,15 +163,19 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); + auto out_dims = + deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), info)); _is_prepared = weights_info.retain_internal_weights(); @@ -158,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape unsigned int deconv_pad_x = 0; unsigned int deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0; unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0; @@ -179,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _scaled_output.allocator()->init(scale_out_info); // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, + deconv_pad_bottom, DimensionRoundingType::FLOOR); _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output @@ -191,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte _flip_axis.allocator()->allocate(); _flip_axis.map(true); auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); - if(weights->info()->data_layout() == DataLayout::NHWC) + if (weights->info()->data_layout() == DataLayout::NHWC) { axis_data[0] = 1; axis_data[1] = 2; @@ -216,7 +244,7 @@ void CLDirectDeconvolutionLayer::run() void CLDirectDeconvolutionLayer::prepare() { - 
if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -229,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare() _conv_f.prepare(); // Free flipped weights - if(!_weights_flipped.is_used()) + if (!_weights_flipped.is_used()) { _weights_flipped.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 936b37fb31..d9529f0b7f 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -26,8 +26,8 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClAdd.h" #include "src/gpu/cl/operators/ClElementwiseOperations.h" #include "src/gpu/cl/operators/ClSub.h" @@ -36,26 +36,30 @@ namespace arm_compute { struct CLArithmeticAddition::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClAdd> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClAdd> op{nullptr}; }; -CLArithmeticAddition::CLArithmeticAddition() - : _impl(std::make_unique<Impl>()) +CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>()) { } -CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; +CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default; CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default; CLArithmeticAddition::~CLArithmeticAddition() = default; -void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticAddition::configure( + ICLTensor 
*input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticAddition::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClAdd::validate(input1, input2, output, policy, act_info); } @@ -82,26 +90,33 @@ void CLArithmeticAddition::run() struct CLArithmeticSubtraction::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClSub> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClSub> op{nullptr}; }; -CLArithmeticSubtraction::CLArithmeticSubtraction() - : _impl(std::make_unique<Impl>()) +CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>()) { } -CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default; +CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction 
&&) = default; CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default; CLArithmeticSubtraction::~CLArithmeticSubtraction() = default; -void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CLArithmeticSubtraction::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); } -void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ConvertPolicy policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; @@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info); } -Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return opencl::ClSub::validate(input1, input2, output, policy, act_info); } @@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run() struct CLArithmeticDivision::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + 
const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr}; }; -CLArithmeticDivision::CLArithmeticDivision() - : _impl(std::make_unique<Impl>()) +CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>()) { } -CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; +CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default; CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default; CLArithmeticDivision::~CLArithmeticDivision() = default; -void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLArithmeticDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info); } @@ 
-173,26 +201,32 @@ void CLArithmeticDivision::run() struct CLElementwiseMax::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseMax> op{nullptr}; }; -CLElementwiseMax::CLElementwiseMax() - : _impl(std::make_unique<Impl>()) +CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>()) { } -CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; +CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default; CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default; CLElementwiseMax::~CLElementwiseMax() = default; -void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMax::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMax::validate(const ITensorInfo *input1, + const 
ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMax::validate(input1, input2, output, act_info); } @@ -218,26 +255,32 @@ void CLElementwiseMax::run() struct CLElementwiseMin::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseMin> op{nullptr}; }; -CLElementwiseMin::CLElementwiseMin() - : _impl(std::make_unique<Impl>()) +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>()) { } -CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; CLElementwiseMin::~CLElementwiseMin() = default; -void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, 
const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } @@ -263,26 +309,32 @@ void CLElementwiseMin::run() struct CLElementwiseSquaredDiff::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr}; }; -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _impl(std::make_unique<Impl>()) +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>()) { } -CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const 
ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); } @@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run() struct CLElementwisePower::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClElementwisePower> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwisePower> op{nullptr}; }; -CLElementwisePower::CLElementwisePower() - : _impl(std::make_unique<Impl>()) +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>()) { } -CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, 
ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp index 9dcd2d1891..3043c26feb 100644 --- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClElementwiseUnary.h" @@ -32,17 +33,16 @@ namespace arm_compute { struct CLRsqrtLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClRsqrt> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClRsqrt> op{nullptr}; }; -CLRsqrtLayer::CLRsqrtLayer() - : _impl(std::make_unique<Impl>()) +CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>()) { } -CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; +CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default; CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = 
default; CLRsqrtLayer::~CLRsqrtLayer() = default; @@ -74,17 +74,16 @@ void CLRsqrtLayer::run() struct CLExpLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClExp> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClExp> op{nullptr}; }; -CLExpLayer::CLExpLayer() - : _impl(std::make_unique<Impl>()) +CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>()) { } -CLExpLayer::CLExpLayer(CLExpLayer &&) = default; +CLExpLayer::CLExpLayer(CLExpLayer &&) = default; CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default; CLExpLayer::~CLExpLayer() = default; @@ -116,17 +115,16 @@ void CLExpLayer::run() struct CLNegLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClNeg> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClNeg> op{nullptr}; }; -CLNegLayer::CLNegLayer() - : _impl(std::make_unique<Impl>()) +CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>()) { } -CLNegLayer::CLNegLayer(CLNegLayer &&) = default; +CLNegLayer::CLNegLayer(CLNegLayer &&) = default; CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default; CLNegLayer::~CLNegLayer() = default; @@ -157,17 +155,16 @@ void CLNegLayer::run() struct CLSinLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClSin> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClSin> op{nullptr}; }; -CLSinLayer::CLSinLayer() - : _impl(std::make_unique<Impl>()) +CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>()) { } -CLSinLayer::CLSinLayer(CLSinLayer &&) = default; +CLSinLayer::CLSinLayer(CLSinLayer &&) = default; CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default; CLSinLayer::~CLSinLayer() = default; @@ -198,17 +195,16 @@ void CLSinLayer::run() struct CLAbsLayer::Impl { - const ICLTensor *src{ nullptr 
}; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClAbs> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClAbs> op{nullptr}; }; -CLAbsLayer::CLAbsLayer() - : _impl(std::make_unique<Impl>()) +CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>()) { } -CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; +CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default; CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default; CLAbsLayer::~CLAbsLayer() = default; @@ -239,17 +235,16 @@ void CLAbsLayer::run() struct CLLogLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClLog> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClLog> op{nullptr}; }; -CLLogLayer::CLLogLayer() - : _impl(std::make_unique<Impl>()) +CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>()) { } -CLLogLayer::CLLogLayer(CLLogLayer &&) = default; +CLLogLayer::CLLogLayer(CLLogLayer &&) = default; CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default; CLLogLayer::~CLLogLayer() = default; @@ -280,17 +275,16 @@ void CLLogLayer::run() struct CLRoundLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClRound> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClRound> op{nullptr}; }; -CLRoundLayer::CLRoundLayer() - : _impl(std::make_unique<Impl>()) +CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>()) { } -CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; +CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default; CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default; CLRoundLayer::~CLRoundLayer() = default; diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index bd0966b65f..48e9ae824a 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -26,13 
+26,13 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager) @@ -54,7 +54,10 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) +void CLFFT1D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT1DInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config)); @@ -77,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, + digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); _fft_kernels.reserve(_num_ffts); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -93,18 +97,20 @@ void 
CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>()); - _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, + ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -123,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = CLFFTRadixStageKernel::supported_radix(); @@ -132,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() 
!= 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); @@ -151,13 +157,13 @@ void CLFFT1D::run() CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling - if(_run_scale) + if (_run_scale) { CLScheduler::get().enqueue(*_scale_kernel, true); } diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 94fc411355..3857046719 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,16 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -46,7 +49,10 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) +void CLFFT2D::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const FFT2DInfo 
&config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config)); @@ -88,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index d12e2de3bf..3894b10785 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -25,10 +25,12 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" @@ -38,8 +40,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -50,11 +50,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -104,17 +104,31 @@ 
CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem { } -void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); - ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), + biases != nullptr ? 
biases->info() : nullptr, + output->info(), conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); _original_weights = weights; @@ -124,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = _has_bias ? 
&_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -146,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -167,7 +184,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -175,10 +192,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -202,7 +219,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _memory_group.manage(&_itransformed_output); 
FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -214,25 +232,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, + Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) 
{ output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -243,7 +264,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -255,7 +276,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(compile_context, output, nullptr, act_info); } @@ -269,8 +290,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I _flip_axis.unmap(); } -Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math); @@ -287,24 +313,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - 
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x()); } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } @@ -320,7 +349,7 @@ void CLFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -336,17 +365,17 @@ void CLFFTConvolutionLayer::run() _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer()); _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { 
_activation_layer_func.run(); } @@ -354,10 +383,10 @@ void CLFFTConvolutionLayer::run() void CLFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -366,7 +395,7 @@ void CLFFTConvolutionLayer::prepare() const ICLTensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index 6019a84aba..9bd96a975e 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFill.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFill::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClFill> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFill> op{nullptr}; }; -CLFill::CLFill() - : _impl(std::make_unique<Impl>()) +CLFill::CLFill() : _impl(std::make_unique<Impl>()) { } -CLFill::CLFill(CLFill &&) = default; +CLFill::CLFill(CLFill &&) = default; CLFill &CLFill::operator=(CLFill &&) = default; CLFill::~CLFill() = default; @@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window); } -void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window) +void CLFill::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + const PixelValue &constant_value, + Window *dst_window) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 32fc37552c..ba1b5372d3 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -26,8 +26,9 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -36,16 +37,15 @@ namespace arm_compute { struct CLFlattenLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClFlatten> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFlatten> op{nullptr}; }; -CLFlattenLayer::CLFlattenLayer() - : _impl(std::make_unique<Impl>()) +CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>()) { } -CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; +CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default; CLFlattenLayer::~CLFlattenLayer() = default; @@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique<opencl::ClFlatten>(); _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); @@ -68,9 +69,10 
@@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return opencl::ClFlatten::validate(input, output); @@ -83,4 +85,4 @@ void CLFlattenLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 8739e1803e..4322219dd9 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClFloor.h" @@ -34,16 +35,15 @@ namespace arm_compute { struct CLFloor::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClFloor> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFloor> op{nullptr}; }; -CLFloor::CLFloor() - : _impl(std::make_unique<Impl>()) +CLFloor::CLFloor() : _impl(std::make_unique<Impl>()) { } -CLFloor::CLFloor(CLFloor &&) = default; +CLFloor::CLFloor(CLFloor &&) = default; CLFloor &CLFloor::operator=(CLFloor &&) = default; CLFloor::~CLFloor() = default; diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 1c162db79a..b30f9e701f 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClFullyConnected.h" @@ -35,21 +36,22 @@ using namespace arm_compute::experimental; struct CLFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr<opencl::ClFullyConnected> op{ nullptr }; + std::unique_ptr<opencl::ClFullyConnected> op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; 
WorkspaceData<CLTensor> workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique<Impl>()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); @@ -58,39 +60,45 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> mem CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; -void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), fc_info)); _impl->op = std::make_unique<opencl::ClFullyConnected>(); _impl->original_weights = weights; _impl->is_prepared = fc_info.retain_internal_weights; - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { @@ -98,14 +106,14 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c _impl->run_pack.add_tensor(ACL_DST, output); } - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo 
*output, FullyConnectedLayerInfo fc_info) { return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); @@ -113,7 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn void CLFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -124,7 +132,7 @@ void CLFullyConnectedLayer::run() void CLFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -133,13 +141,13 @@ void CLFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) { _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 7379e9d9fe..e4fbf78e13 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "src/common/utils/Log.h" +#include 
"src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" namespace arm_compute { @@ -41,29 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization() CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; -void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, 
fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); - _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, + bn_beta, bn_gamma, epsilon, fbn_type); } -Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void CLFuseBatchNormalization::run() diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 427ea51ab9..871a1d6e27 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include 
"src/gpu/cl/operators/ClGemm.h" @@ -40,15 +41,15 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; WorkspaceData<CLTensor> workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) @@ -60,12 +61,25 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager * CLGEMM::~CLGEMM() = default; -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -73,25 +87,33 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor _impl->op = std::make_unique<OperatorType>(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), alpha, beta, gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + alpha, beta, gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } @@ -107,15 +129,15 @@ void CLGEMM::run() void CLGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != 
std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index c8c18f35db..aef7cddd7a 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -27,10 +27,11 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "support/Cast.h" @@ -47,18 +48,19 @@ using namespace arm_compute::experimental; struct CLGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr<opencl::ClGemmConv2d> op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr<opencl::ClGemmConv2d> op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData<CLTensor> workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique<Impl>()) { _impl->memory_group = MemoryGroup(memory_manager); @@ -67,40 +69,60 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; -void CLGEMMConvolutionLayer::configure(const ICLTensor *input, 
const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); _impl->weights = weights; _impl->op = std::make_unique<opencl::ClGemmConv2d>(); const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv2d_info, weights_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); @@ -115,14 +137,14 @@ void CLGEMMConvolutionLayer::run() void CLGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = 
std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 9fc81c11da..7d40cf1829 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include <tuple> @@ -40,12 +40,13 @@ namespace arm_compute { namespace { -std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair<Coordinates, Coordinates> +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ 
-63,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -78,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -122,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default; -Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) 
{ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); DataLayout data_layout = input->data_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -144,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); 
CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -166,77 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + 
&gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), 
output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), deconv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -245,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. - if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -257,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. 
The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -269,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. 
QuantizationInfo iq_info = input->info()->quantization_info(); QuantizationInfo wq_info = weights->info()->quantization_info(); - input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); - _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); + input_to_use->info()->set_quantization_info( + QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); + _reshaped_weights_t.info()->set_quantization_info( + QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); @@ -286,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } else { - _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, + gemm_info); } - if(_is_nchw) + if (_is_nchw) { _permuted_input.allocator()->allocate(); } @@ -298,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context ICLTensor *slice_output = nullptr; ICLTensor *output_stage_output = nullptr; - if(_padded_input && _is_quantized) + if (_padded_input && _is_quantized) { _memory_group.manage(&_slice_gemm_input); _memory_group.manage(&_gemmlowp_final); @@ -306,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context output_stage_output = &_slice_gemm_input; slice_output = output; } - else if(_padded_input) + else if (_padded_input) { _memory_group.manage(&_slice_gemm_input); deconv_reshape_output = &_slice_gemm_input; slice_output = output; } - else if(_is_quantized) + else if (_is_quantized) { _memory_group.manage(&_gemmlowp_final); deconv_reshape_output = &_gemmlowp_final; @@ -324,21 +367,24 @@ void 
CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), + weights->info(), deconv_info); _gemm_output.allocator()->allocate(); - if(_is_quantized) + if (_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, + output_stage_info); _gemmlowp_final.allocator()->allocate(); } // If the input was padded, the output needs to be sliced. - if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -350,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -366,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run() CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -379,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() 
{ - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -392,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -401,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -411,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index d9029478a1..8bad198658 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -31,12 +31,12 @@ #include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/IMemoryManager.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" namespace arm_compute @@ -46,13 +46,13 @@ using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; struct CLGEMMLowpMatrixMultiplyCore::Impl { - const ICLTensor *b{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; MemoryRequirements aux_mem_req{}; 
WorkspaceData<CLTensor> workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) @@ -63,12 +63,18 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default; -void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +void CLGEMMLowpMatrixMultiplyCore::configure( + const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info); } -void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -76,23 +82,29 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _impl->op = std::make_unique<OperatorType>(); _impl->is_prepared = gemm_info.retain_internal_weights(); - _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); // Manage/allocate auxilairy tensors - if(_impl->is_prepared) + if (_impl->is_prepared) { _impl->run_pack.add_const_tensor(ACL_SRC_0, a); _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } }; - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { return OperatorType::validate(a, b, c, output, gemm_info); } @@ -108,7 +120,7 @@ void CLGEMMLowpMatrixMultiplyCore::run() void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 6feed0d713..3dd8c5f101 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -40,27 +40,33 @@ namespace arm_compute { struct CLGEMMLowpOutputStage::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *bias{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClGemmLowpOutputStage> op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + 
std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr}; ITensorPack run_pack{}; }; -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() - : _impl(std::make_unique<Impl>()) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { } -CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -69,11 +75,15 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c _impl->dst = output; _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>(); - _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info); - _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } }; + _impl->op->configure(compile_context, input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index 033c117cec..2610cb1a3b 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/kernels/CLGatherKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute { @@ -35,7 +35,11 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); auto k = std::make_unique<CLGatherKernel>(); diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 9cb7d618cf..b2c1d2631e 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -27,13 +27,13 @@ #include "arm_compute/core/Types.h" #include 
"arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager) @@ -71,48 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default; -void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals, +void CLGenerateProposalsLayer::configure(const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); + configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, + num_valid_proposals, info); } -void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, - ICLTensor *scores_out, - ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) +void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, + const GenerateProposalsInfo &info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int 
post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -126,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(compile_context, scores, &_scores_permuted, 
PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -140,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -163,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -183,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. 
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -195,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -216,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + 
scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -227,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() 
== DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - 
proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, 
&deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -309,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -356,7 +412,7 @@ void CLGenerateProposalsLayer::run() CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -364,7 +420,7 @@ void CLGenerateProposalsLayer::run() _flatten_deltas.run(); _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors->run(); _dequantize_deltas->run(); @@ -373,7 +429,7 @@ void CLGenerateProposalsLayer::run() // Build the boxes CLScheduler::get().enqueue(*_bounding_box_kernel, false); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals->run(); } diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp index 90af36aa77..1a2369c5c2 100644 --- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp @@ -26,36 +26,45 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" namespace arm_compute { struct CLIndirectConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClIndirectConv2d> op{ nullptr }; + const ICLTensor 
*src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr}; }; -CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() - : _impl(std::make_unique<Impl>()) +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>()) { } -CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default; CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; -void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } -void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); @@ -65,10 +74,15 @@ void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_conte 
_impl->biases = biases; _impl->dst = output; _impl->op = std::make_unique<opencl::ClIndirectConv2d>(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); } -Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); @@ -83,4 +97,4 @@ void CLIndirectConvolutionLayer::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 5feafe19db..0e994e1aee 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -27,50 +27,62 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT - : _inst_norm_kernel(), - _mean_var_kernel(), - _mean_var_tensor(), - _ctx(ctx) + : _inst_norm_kernel(), 
_mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) { } CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); auto w = std::make_unique<CLComputeMeanVariance>(); w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); _mean_var_kernel = std::move(w); auto k = std::make_unique<CLInstanceNormalizationLayerKernel>(); - k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); _inst_norm_kernel = std::move(k); _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - return 
CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } void CLInstanceNormalizationLayer::run() { - ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured"); + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 1278385f53..4fe1d9b20b 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace @@ -57,7 +57,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); @@ -86,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo 
sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index ea08beca75..3b50234c77 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -24,15 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/gpu/cl/kernels/ClTransposeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute { @@ -40,54 +40,155 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), - _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), 
_pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), - _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), - _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), - _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), - _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), - _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), - _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), - _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), - _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), - _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + 
_activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + 
_is_prepared(false), + _is_layer_norm_lstm(false) { } CLLSTMLayer::~CLLSTMLayer() = default; -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + 
configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, - output, lstm_params, activation_info, cell_threshold, projection_threshold); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -96,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), 
output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -126,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -154,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, 
ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -178,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); 
_ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -195,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); @@ -203,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -221,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // 
input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -249,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, 
cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, + ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // 
Configure block that calculates the output @@ -298,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); @@ -306,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -329,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, + lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be reassigned, so allocate the tensor that it was 
assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -361,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); 
_output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -383,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // Vector for holding the tensors to store in scratch buffer std::vector<const ICLTensor *> scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -397,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe output_gate_out->allocator()->allocate(); } -Status CLLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status CLLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo 
*input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + 
recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -438,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -459,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -470,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -488,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type()); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? 
nullptr : forget_gate_bias, &forget_gate)); std::vector<const ITensorInfo *> inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, 
RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -525,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), 
(lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, 
ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + 
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? 
nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -616,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector<const ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -638,12 +829,12 @@ void CLLSTMLayer::run() _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - 
if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -651,7 +842,7 @@ void CLLSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { _ones_fill.run(); _subtract_input_gate.run(); @@ -660,13 +851,13 @@ void CLLSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -679,12 +870,10 @@ void CLLSTMLayer::run() ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); - CLScheduler::get().enqueue_op(*_transpose_cell_state, - pack, - false); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -695,19 +884,19 @@ void CLLSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -718,10 +907,10 @@ void CLLSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -735,10 +924,10 @@ void CLLSTMLayer::run() void CLLSTMLayer::prepare() { - if(!_is_prepared) + if 
(!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index d14c6102d5..ea64eda023 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -25,12 +25,12 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/helpers/AutoConfiguration.h" +#include "arm_compute/core/Validate.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include <memory> @@ -46,48 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), 
_recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + 
_input_modulation_gate_output(), + _cell_state_tmp1(), + _cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + _is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + 
input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - 
input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + 
input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -95,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -124,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, 
DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector<const ICLTensor *> weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -144,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -159,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -169,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext 
&compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -191,85 +280,111 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, 
&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); 
_forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + 
TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + 
ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); 
_output_state_out_symm.allocator()->allocate(); @@ -278,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, 
input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); @@ -299,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + 
->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -343,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, 
Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector<const ITensorInfo *> weights_vector; @@ -353,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -379,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -390,7 +542,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info{}; @@ -405,68 +558,91 @@ Status 
CLLSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size 
})); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + 
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -475,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - 
if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -541,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp index 696191c485..ea21c54bc3 100644 --- a/src/runtime/CL/functions/CLLogicalAnd.cpp +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include <utility> @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); @@ -54,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors) struct CLLogicalAnd::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalAnd> op{nullptr}; }; -CLLogicalAnd::CLLogicalAnd() - : _impl(std::make_unique<Impl>()) +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>()) { } -CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; CLLogicalAnd::~CLLogicalAnd() = default; @@ -73,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; 
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp index a0504d7852..71f9cce54f 100644 --- a/src/runtime/CL/functions/CLLogicalNot.cpp +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClLogicalNot.h" @@ -32,16 +33,15 @@ namespace arm_compute { struct CLLogicalNot::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClLogicalNot> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClLogicalNot> op{nullptr}; }; -CLLogicalNot::CLLogicalNot() - : _impl(std::make_unique<Impl>()) +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>()) { } -CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; CLLogicalNot::~CLLogicalNot() = default; @@ -72,4 +72,4 @@ void CLLogicalNot::run() _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp index f9a606e8a5..3db4fdae84 100644 --- a/src/runtime/CL/functions/CLLogicalOr.cpp +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" #include <utility> @@ -33,7 +34,10 @@ namespace arm_compute { namespace experimental { -void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output); auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); @@ -54,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors) struct CLLogicalOr::Impl { - const ICLTensor *src0{ nullptr }; - const ICLTensor *src1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLLogicalOr> op{ nullptr }; + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalOr> op{nullptr}; }; -CLLogicalOr::CLLogicalOr() - : _impl(std::make_unique<Impl>()) +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>()) { } -CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; CLLogicalOr::~CLLogicalOr() = default; @@ -73,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void 
CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) { _impl->src0 = input1; _impl->src1 = input2; diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp index bef422fca1..e8bdad706b 100644 --- a/src/runtime/CL/functions/CLMatMul.cpp +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLMatMul.h" + #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTypes.h" + #include "src/gpu/cl/operators/ClMatMul.h" namespace arm_compute @@ -32,23 +34,32 @@ using OperatorType = opencl::ClMatMul; struct CLMatMul::Impl { - std::unique_ptr<OperatorType> op{ nullptr }; + std::unique_ptr<OperatorType> op{nullptr}; ITensorPack run_pack{}; }; -CLMatMul::CLMatMul() - : _impl(std::make_unique<Impl>()) +CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>()) { } CLMatMul::~CLMatMul() = default; -void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(settings); configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); } -void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings 
&settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); @@ -56,10 +67,14 @@ void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs _impl->op = std::make_unique<OperatorType>(); _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; } -Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); } diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 2786d32d33..7494f379b9 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -27,26 +27,32 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _fill(), - _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>()) + : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>()) { } CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; -void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, 
+ ICLTensor *output, + const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); } -void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); const PixelValue zero_value(0.f); @@ -55,7 +61,10 @@ void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICL _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } -Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); } diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index a81cbca1b0..5892c0e840 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void 
CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); auto k = std::make_unique<CLMeanStdDevNormalizationKernel>(); diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index c0cc5184e6..f93f82f1a2 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" namespace arm_compute { @@ -50,7 +50,10 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); @@ -58,21 +61,24 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC // Configure normalization kernel _norm_kernel->configure(compile_context, input, output, norm_info); - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Fill the border by 3 
elements since we need vload4 in the IN_MAP normalization kernel - _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - if(!_norm_kernel->border_size().empty()) + if (!_norm_kernel->border_size().empty()) { // Run border handler CLScheduler::get().enqueue(*_border_handler, false); diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 63c9164a94..939c95bd45 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,20 +24,26 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include <utility> namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void 
CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>(); @@ -45,8 +51,10 @@ void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_contex _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index 186e7b4ba2..ce6d285ebe 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLPReluLayer.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/operators/ClPRelu.h" @@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu; struct CLPReluLayer::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; }; -CLPReluLayer::CLPReluLayer() - : _impl(std::make_unique<Impl>()) +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>()) { } -CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; CLPReluLayer::~CLPReluLayer() = default; @@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { _impl->src_0 = input; _impl->src_1 = alpha; _impl->dst = output; _impl->op = std::make_unique<OperatorType>(); - _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info())); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? 
input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 0ed8f03d64..e788ded512 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,37 +22,38 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" -#include "src/core/CL/kernels/CLPadLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(std::make_unique<CLPadLayerKernel>()), - _copy(), - _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false) { } CLPadLayer::~CLPadLayer() = default; -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), 
padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } @@ -62,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } @@ -81,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { CLScheduler::get().enqueue(*_pad_kernel); } diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index a56afff7df..7f97eed98a 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -27,22 +27,21 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/gpu/cl/operators/ClPermute.h" #include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { struct CLPermute::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - 
std::unique_ptr<opencl::ClPermute> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClPermute> op{nullptr}; }; -CLPermute::CLPermute() - : _impl(std::make_unique<Impl>()) +CLPermute::CLPermute() : _impl(std::make_unique<Impl>()) { } @@ -53,7 +52,10 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, perm); diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 9d91e58367..6aa9d9cbb3 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClMul.h" @@ -34,38 +35,55 @@ namespace arm_compute { struct CLPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClMul> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClMul> op{nullptr}; }; -CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _impl(std::make_unique<Impl>()) +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) { } -CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; 
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique<opencl::ClMul>(); - _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo 
*input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } @@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run() struct CLComplexPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClComplexMul> op{ nullptr }; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClComplexMul> op{nullptr}; }; -CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _impl(std::make_unique<Impl>()) +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) { } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; -CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; - -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { 
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; @@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return opencl::ClComplexMul::validate(input1, input2, output, act_info); } diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp index 11ae1d0fe6..ce1092a7cc 100644 --- a/src/runtime/CL/functions/CLPooling3dLayer.cpp +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool3d.h" @@ -32,14 +33,13 @@ namespace arm_compute { struct CLPooling3dLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr<opencl::ClPool3d> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool3d> op{nullptr}; }; -CLPooling3dLayer::CLPooling3dLayer() - : 
_impl(std::make_unique<Impl>()) +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>()) { } CLPooling3dLayer::~CLPooling3dLayer() = default; @@ -49,7 +49,10 @@ void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, cons configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); } -void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) { _impl->src = input; _impl->dst = output; @@ -58,7 +61,8 @@ void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const _impl->op->configure(compile_context, input->info(), output->info(), pool_info); } -Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return opencl::ClPool3d::validate(input, output, pool_info); } diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index 0ebce318fa..65e53b9be3 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClPool2d.h" @@ -32,34 +33,44 @@ namespace arm_compute { struct CLPoolingLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - ICLTensor *indices{ nullptr }; - std::unique_ptr<opencl::ClPool2d> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool2d> op{nullptr}; }; 
-CLPoolingLayer::CLPoolingLayer() - : _impl(std::make_unique<Impl>()) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>()) { } CLPoolingLayer::~CLPoolingLayer() = default; -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); } -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) { _impl->src = input; _impl->dst = output; _impl->indices = indices; _impl->op = std::make_unique<opencl::ClPool2d>(); - _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? 
indices->info() : nullptr); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return opencl::ClPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 019f0a7e61..cfd0ec4fbf 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -29,31 +29,40 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { 
ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } auto k = std::make_unique<CLPriorBoxLayerKernel>(); @@ -61,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 7fbb866fa9..12f6f89290 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -26,29 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" 
#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { using namespace arm_compute::utils::info_helpers; using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -78,14 +85,12 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, 
_window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); @@ -104,7 +109,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) _layer_norms(), _copy_output() { - for(auto &norm : _layer_norms) + for (auto &norm : _layer_norms) { norm = std::make_unique<CLQLSTMLayerNormalizationKernel>(); } @@ -129,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. - const TensorInfo out - { - in - }; + const TensorInfo out{in}; return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -151,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML 
mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, 
lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -191,11 +222,11 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), 
input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -216,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -238,53 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. 
- if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, 
true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - 
_projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); } } // Pre-transpose weights to be used in GEMM. - _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - 
_transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -297,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. - const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = 
input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + 
_pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, 
&_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -345,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. - const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, - mm_out_info, cell_outstage_info); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); - - _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, 
&_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, + &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, + &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); + + _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res); _recurrent_to_cell_outstage_res.allocator()->allocate(); @@ -378,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(compile_context, 
cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -393,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(compile_context, 
_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, - ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, + &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, 
&gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, + &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } CLTensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res); _recurrent_to_input_outstage_res.allocator()->allocate(); input_activation_input = 
&get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, + 
ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float 
recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, 
QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, + &_cell_to_output_outstage_res, gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } CLTensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res); _recurrent_to_output_outstage_res.allocator()->allocate(); @@ -503,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo output_gate_info(TensorShape(num_units, 
batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. - _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); 
gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -525,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -536,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _hidden_mul_res.allocator()->allocate(); // Projection. - if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, 
&_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ICLTensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -565,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, + accumulate_destination, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(compile_context, output_state_out, 
nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -600,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _copy_output.configure(compile_context, output_state_out, output); } -Status CLQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status CLQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo 
*output, const LSTMParams<ITensorInfo> &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -622,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -647,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -674,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -682,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. 
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, 
-qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if 
(lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, - &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_forget_weights->data_type(), + input_to_forget_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); // Validate weights transpose ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed)); @@ -717,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -738,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, 
ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + 
ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -770,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, 
ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -791,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. 
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + 
&input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const 
ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } 
// Output gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + 
&output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -877,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo 
output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); 
gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), 
lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), 
qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -980,14 +1210,14 @@ void CLQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget)); } @@ -1002,7 +1232,7 @@ void CLQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell)); } @@ -1010,7 +1240,7 @@ void CLQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { 
_input_gate_sub.run(); } @@ -1022,14 +1252,14 @@ void CLQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input)); } @@ -1041,7 +1271,7 @@ void CLQLSTMLayer::run() _pixelwise_mul_forget_cell.run(); _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1052,14 +1282,14 @@ void CLQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output)); } @@ -1072,31 +1302,31 @@ void CLQLSTMLayer::run() _hidden_outstage.run(); // Projection. - if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1108,7 +1338,7 @@ void CLQLSTMLayer::run() void CLQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Pre-transpose weights to be used in GEMM. 
_input_to_forget_weights_transposed.allocator()->allocate(); @@ -1125,10 +1355,11 @@ void CLQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { _ones.map(true); - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); _ones.unmap(); } else @@ -1136,10 +1367,12 @@ void CLQLSTMLayer::prepare() _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights}, + {ACL_DST, &_input_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); - ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights}, + {ACL_DST, &_recurrent_to_input_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); @@ -1156,30 +1389,35 @@ void CLQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights}, + {ACL_DST, &_input_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); - ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { 
ACL_DST, &_recurrent_to_forget_eff_bias } }; + ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights}, + {ACL_DST, &_recurrent_to_forget_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); - ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); - ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights}, + {ACL_DST, &_recurrent_to_cell_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); - ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights}, + {ACL_DST, &_input_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); - ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights}, + {ACL_DST, &_recurrent_to_output_eff_bias}}; CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); - if(_has_projection) + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}}; CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); - 
if(_projection_bias != nullptr) + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1189,7 +1427,7 @@ void CLQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index b249bdd1db..6edef29992 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClQuantize.h" @@ -32,13 +33,12 @@ namespace arm_compute { struct CLQuantizationLayer::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClQuantize> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClQuantize> op{nullptr}; }; -CLQuantizationLayer::CLQuantizationLayer() - : _impl(std::make_unique<Impl>()) +CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>()) { } CLQuantizationLayer::~CLQuantizationLayer() = default; diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 6f122866b2..34b78eefa7 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,24 +28,37 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; 
CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_kernel(), + _activation(), + _fully_connected_kernel(), + _copy(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } CLRNNLayer::~CLRNNLayer() = default; -Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status CLRNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -63,28 +76,42 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, +void CLRNNLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, + output, info); } -void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info) +void CLRNNLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); 
ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); @@ -133,7 +160,7 @@ void CLRNNLayer::run() void CLRNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected_kernel.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 867ef7c7ac..1939d1d0ba 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,26 +24,36 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" namespace arm_compute { -Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) 
+void CLROIAlignLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index 239a1c6bb2..0d2eab0c76 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,24 +22,35 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" + #include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" using namespace arm_compute; -Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, 
pool_info); diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 3fbbd5f952..5c3f7f9c8c 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLRangeKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLRangeKernel.h" using namespace arm_compute; @@ -38,7 +38,8 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRange::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_LOG_PARAMS(output, start, end, step); auto k = std::make_unique<CLRangeKernel>(); diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index cddbf77d7c..6c6daff5ba 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -27,23 +27,25 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, 
const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -51,29 +53,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). 
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -87,8 +89,9 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); - if(requant) + const bool requant = + is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if (requant) { TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); CLDequantizationLayer::validate(input, &input_no_quant); @@ -98,10 +101,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } return Status{}; } -} +} // namespace CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), 
_reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(), + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _dequant(), + _requant(), + _reduction_ops(), + _keep_dims(), + _do_requant(), + _input_no_quant(), _output_no_quant() { } @@ -111,17 +123,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); } -void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +void CLReduceMean::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info(); + _do_requant = is_data_type_quantized(input->info()->data_type()) && + input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 
1 : 0)); @@ -129,7 +147,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor ICLTensor *tmp_input = input; ICLTensor *tmp_output = output; - if(_do_requant) + if (_do_requant) { _memory_group.manage(&_input_no_quant); _memory_group.manage(&_output_no_quant); @@ -148,46 +166,51 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!_keep_dims) + if (!_keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); } - if(_do_requant) + if (_do_requant) { _requant.configure(compile_context, &_output_no_quant, output); _input_no_quant.allocator()->allocate(); @@ -195,7 +218,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -204,19 +230,19 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_requant) + if (_do_requant) { _dequant.run(); } - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } - if(_do_requant) + if (_do_requant) { _requant.run(); } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index cdc7fec51b..ba5489018e 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -27,35 +27,43 @@ #include "arm_compute/core/Helpers.h" 
#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false) + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), + _is_reshape_required(false) { } CLReductionOperation::~CLReductionOperation() = default; -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const 
TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -67,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } @@ -92,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output) { - if(!_is_reshape_required) + if (!_is_reshape_required) { return output; } @@ -103,12 +112,18 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, 
unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -117,11 +132,17 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _memory_group.manage(&_unreshaped_output); } @@ -129,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _reduction_kernel = std::make_unique<CLReductionOperationKernel>(); _reduction_kernel->configure(compile_context, 
input, output_internal, axis, op); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(compile_context, &_unreshaped_output, output); _unreshaped_output.allocator()->allocate(); @@ -142,7 +163,7 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 15de959225..156e9b90c1 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include <utility> @@ -40,7 +40,10 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_LOG_PARAMS(input, output, stride); auto k = std::make_unique<CLReorgLayerKernel>(); diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index c51a3298c1..3d6349fb25 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClReshape.h" @@ -35,17 +36,16 @@ namespace arm_compute { struct CLReshapeLayer::Impl { - const ICLTensor *src{ 
nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClReshape> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClReshape> op{nullptr}; }; -CLReshapeLayer::CLReshapeLayer() - : _impl(std::make_unique<Impl>()) +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>()) { } -CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; CLReshapeLayer::~CLReshapeLayer() = default; @@ -78,4 +78,4 @@ void CLReshapeLayer::run() _impl->op->run(pack); } } // namespace arm_compute -/** [CLReshapeLayer snippet] **/
\ No newline at end of file + /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 1fc93571d9..415de52e64 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLReverse.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLReverseKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { @@ -35,7 +35,10 @@ void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis) { ARM_COMPUTE_LOG_PARAMS(input, output, axis); auto k = std::make_unique<CLReverseKernel>(); diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 5b78989bfa..abff0724e4 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClScale.h" @@ -33,13 +34,12 @@ namespace arm_compute { struct CLScale::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClScale> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClScale> op{nullptr}; }; -CLScale::CLScale() - : _impl(std::make_unique<Impl>()) +CLScale::CLScale() : _impl(std::make_unique<Impl>()) { } CLScale::~CLScale() = default; @@ -49,7 +49,10 
@@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index c4ab3dc67a..b4897d9e62 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSelectKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSelectKernel.h" using namespace arm_compute; @@ -38,7 +38,11 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(c, x, y, output); auto k = std::make_unique<CLSelectKernel>(); diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index 7e3ac7d769..f79c6a1235 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include 
"src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct CLSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<experimental::CLSlice> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLSlice> op{nullptr}; }; -CLSlice::CLSlice() - : _impl(std::make_unique<Impl>()) +CLSlice::CLSlice() : _impl(std::make_unique<Impl>()) { } -CLSlice::CLSlice(CLSlice &&) = default; +CLSlice::CLSlice(CLSlice &&) = default; CLSlice 
&CLSlice::operator=(CLSlice &&) = default; CLSlice::~CLSlice() = default; -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::CLSlice::validate(input, output, starts, ends); } @@ -89,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); } -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) { _impl->src = input; _impl->dst = output; diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index d52352fc8d..2e70e2aa08 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -22,12 +22,14 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" #include "src/gpu/cl/operators/ClPermute.h" @@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax; template <bool IS_LOG> struct CLSoftmaxLayerGeneric<IS_LOG>::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<OperatorType> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData<CLTensor> workspace_tensors{}; @@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) +void CLSoftmaxLayerGeneric<IS_LOG>::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { _impl->src = input; _impl->dst = output; _impl->op = std::make_unique<OperatorType>(); - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool 
IS_LOG> -Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis }; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; return OperatorType::validate(*input, *output, softmax_info); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::run() +void CLSoftmaxLayerGeneric<IS_LOG>::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 3b7083400b..37f728895f 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,71 +29,100 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), - _fill(), - _has_padding(false) + : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false) { } CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const 
CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor 
*input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, + output); } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D 
&padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info()))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -101,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { //CLScheduler::get().enqueue(*_fill, true); _fill.run(); diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 67dafff47f..22695c9ef3 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>()) +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>()) { } @@ -47,7 +46,10 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor 
*input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); _space_to_depth_kernel->configure(compile_context, input, output, block_shape); diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index 0b27371e3f..6be43cc5cd 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute @@ -38,7 +39,7 @@ void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 6a335da00c..c15496fc31 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include <complex> - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,16 +30,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLStackLayerKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" + +#include <complex> namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -52,7 +50,10 @@ void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, IC configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector<ICLTensor *> &input, + int axis, + ICLTensor *output) { ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); @@ -61,7 +62,7 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std: // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>()); _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); @@ -79,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); 
@@ -92,7 +93,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { CLScheduler::get().enqueue(*_stack_kernels[i], false); } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index 261bdc13d1..c1953cc415 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { namespace experimental { -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto k = std::make_unique<CLStridedSliceKernel>(); @@ -43,9 +49,14 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IT _kernel = std::move(k); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates 
&ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct CLStridedSlice::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - CLRuntimeContext *ctx{ nullptr }; - std::unique_ptr<experimental::CLStridedSlice> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<experimental::CLStridedSlice> op{nullptr}; }; -CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) - : _impl(std::make_unique<Impl>()) +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { _impl->ctx = ctx; } -CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; CLStridedSlice::~CLStridedSlice() = default; -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, 
const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -86,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC _impl->dst = output; _impl->op = std::make_unique<experimental::CLStridedSlice>(); - _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } void CLStridedSlice::run() diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index ef790995f9..4f86c4adfa 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "src/core/CL/kernels/CLTileKernel.h" - 
#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute { @@ -34,7 +33,10 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_LOG_PARAMS(input, output, multiples); auto k = std::make_unique<CLTileKernel>(); diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index e63c92eeb4..5a738f47ce 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/ICLKernel.h" #include "src/gpu/cl/operators/ClTranspose.h" @@ -34,12 +35,11 @@ namespace arm_compute { struct CLTranspose::Impl { - const ICLTensor *src{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClTranspose> op{ nullptr }; + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClTranspose> op{nullptr}; }; -CLTranspose::CLTranspose() - : _impl(std::make_unique<Impl>()) +CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>()) { } CLTranspose::~CLTranspose() = default; @@ -70,4 +70,4 @@ void CLTranspose::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 98d47810ab..ddd83e7824 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -40,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -56,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -66,15 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector<ICLTensor *> &output_vector, + int axis) { ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector<ITensorInfo *> 
outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -87,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -106,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + 
ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index b416d0fcf1..645f817030 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/CL/ICLKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" @@ -35,15 +36,15 @@ namespace arm_compute { struct CLWinogradConvolutionLayer::Impl { - const ICLTensor *src{ nullptr }; - const ICLTensor *weights{ nullptr }; - const ICLTensor *biases{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr }; + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; WorkspaceData<CLTensor> workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) @@ -54,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryMa CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; -void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const 
ActivationLayerInfo &act_info, - bool enable_fast_math) +void CLWinogradConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) + const ActivationLayerInfo &act_info, + bool enable_fast_math) { _impl->src = input; _impl->weights = weights; @@ -70,20 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _impl->dst = output; _impl->op = std::make_unique<opencl::ClWinogradConv2d>(); - _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, _impl->src }, - { TensorType::ACL_SRC_1, _impl->weights }, - { TensorType::ACL_SRC_2, _impl->biases }, - { TensorType::ACL_DST, _impl->dst } - }; - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } @@ -97,7 +114,7 @@ void CLWinogradConvolutionLayer::run() void CLWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -107,4 +124,4 @@ void CLWinogradConvolutionLayer::prepare() _impl->is_prepared = true; } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp index 18ade97885..4270165ab4 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <map> @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,109 +44,109 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Bifrost architectures - static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, 
&CLGEMMDefaultTypeBifrost::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G71 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G52 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + 
{DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; // Mali-G76 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = - { - { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 }, - { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = { + {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32}, + {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) + if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) + if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, 
+ params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G52: - if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) + if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) { - return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; - if(is_rhs_constant) + if (is_rhs_constant) { - if((m > 1) && (n < 16)) + if ((m > 1) && (n < 16)) { gemm_type = CLGEMMKernelType::RESHAPED; } - else if(m == 1) + else if (m == 1) { gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if((k > 256) && (m > 4)) + if ((k > 256) && (m > 4)) { constexpr float alpha = 3.2f; constexpr float fact0 = 1.51f; constexpr float fact1 = 1.66f; constexpr float ops = 12.0f; const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? 
CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { @@ -156,19 +156,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned const auto workload = static_cast<float>((m * n) / 20.0f); - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type; + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; } return gemm_type; } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -183,11 +185,12 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return 
CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(k <= 496) + if (k <= 496) { - if(n <= 544) + if (n <= 544) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } else { - if(k <= 588) + if (k <= 588) { - if(k <= 552) + if (k <= 552) { - if(m <= 148) + if (m <= 148) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 278) + if (m <= 278) { return CLGEMMKernelType::RESHAPED; } @@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k)); - if(r_mn <= 1.5469f) + if (r_mn <= 1.5469f) { - if(r_mk <= 0.8766f) + if (r_mk <= 0.8766f) { - if(r_mk <= 0.0211f) + if (r_mk <= 0.0211f) { - if(r_mnk <= 77.5833f) + if (r_mnk <= 77.5833f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.0832f) + if (r_nk <= 0.0832f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mnk <= 193.0000f) + if (r_mnk <= 193.0000f) { - if(r_mn <= 0.9948f) + if (r_mn <= 0.9948f) { - if(r_mk <= 2.5453f) + if (r_mk <= 
2.5453f) { return CLGEMMKernelType::RESHAPED; } @@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 17.7370f) + if (r_mn <= 17.7370f) { - if(r_mnk <= 1391.2875f) + if (r_mnk <= 1391.2875f) { - if(r_mk <= 2.9724f) + if (r_mk <= 2.9724f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mnk <= 470.0000f) + if (r_mnk <= 470.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_nk <= 0.1381f) + if (r_nk <= 0.1381f) { - if(r_mnk <= 9040.5000f) + if (r_mnk <= 9040.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } else { - if(r_mn <= 5.6790f) + if (r_mn <= 5.6790f) { return CLGEMMKernelType::RESHAPED; } @@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_nk = static_cast<float>(n) / static_cast<float>(k); - if(k <= 212) + if (k <= 212) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_nk <= 0.4990234375f) + if (r_nk <= 0.4990234375f) { - if(k <= 1392) + if (k <= 1392) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 325) + if (m <= 325) { return 
CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } else { - if(k <= 471) + if (k <= 471) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(r_mn <= 0.04475911520421505f) + if (r_mn <= 0.04475911520421505f) { return CLGEMMKernelType::RESHAPED; } @@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 127.0000f) + if (n <= 127.0000f) { - if(n <= 63.5000f) + if (n <= 63.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 3616.0000f) + if (m <= 3616.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 2970.5000f) + if (m <= 2970.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 12.5000f) + if (m <= 12.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 104.0000f) + if (k <= 104.0000f) { - if(b <= 18.5000f) + if (b <= 18.5000f) { - if(m <= 490.0000f) + if (m <= 490.0000f) { - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } else { - if(m <= 226.0000f) + if (m <= 226.0000f) { - if(n <= 140.0000f) + if (n <= 140.0000f) { - if(m <= 179.5000f) + if (m <= 179.5000f) { return 
CLGEMMKernelType::RESHAPED; } @@ -556,15 +563,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); ARM_COMPUTE_UNUSED(n); ARM_COMPUTE_UNUSED(k); - if(is_rhs_constant) + if (is_rhs_constant) { - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index ef30b28f96..673038a8db 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <map> @@ -35,8 +36,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = 
- { - { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_configs = { + {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } @@ -68,7 +67,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -76,7 +76,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? 
CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(n, k, b); @@ -84,7 +85,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp index 9e779d3752..851e23bc84 100644 --- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <map> @@ -34,8 +35,7 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } @@ -44,135 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + using 
FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Valhall architectures - static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G77 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, 
&CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G78 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G710 and Mali-G610 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, 
&CLGEMMDefaultTypeValhall::default_q8}}; // Mali-G715 and Mali-G615 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = - { - { DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32 }, - { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 }, - { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; const DataType data_type = params.data_type; - switch(_target) + switch (_target) { case GPUTarget::G710: case GPUTarget::G610: - if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) { - return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G715: case GPUTarget::G615: - if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) { - return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case 
GPUTarget::G78: - if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) { - return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G77: - if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) { - return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - if(is_rhs_constant) + if (is_rhs_constant) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -182,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(b); - if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } - if(m == 1) + if (m == 1) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - if(n <= 272.0000f) + if (n <= 272.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(k <= 471.0000f) + if (k <= 471.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 72.5000f) + if (m <= 72.5000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - if(m <= 90.5000f) + if (m <= 90.5000f) { return CLGEMMKernelType::RESHAPED; } else { - if(k <= 2448.0000f) + if (k <= 2448.0000f) { - if(n <= 756.0000f) + if (n <= 756.0000f) { return CLGEMMKernelType::RESHAPED_ONLY_RHS; } @@ -241,11 +243,12 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { ARM_COMPUTE_UNUSED(m, n, k, b); - 
if(!is_rhs_constant) + if (!is_rhs_constant) { return CLGEMMKernelType::NATIVE; } @@ -253,9 +256,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int return CLGEMMKernelType::RESHAPED_ONLY_RHS; } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return default_f32(m, n, k, b, is_rhs_constant); } @@ -263,7 +267,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } @@ -273,9 +277,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int } } -CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - if(!is_rhs_constant) + if (!is_rhs_constant) { return g78_f16(m, n, k, b, is_rhs_constant); } @@ -283,7 +288,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int unsigned int best_m0; unsigned int best_n0; - if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; } diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index 
6189a324cf..c528dbcac4 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -25,6 +25,7 @@ #define SRC_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" @@ -45,7 +46,7 @@ public: */ static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu); diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp index b06c3b0f8e..8df57197e2 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" @@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_ bool valid = false; CLGEMMKernelType gemm_type{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - 
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); } else { @@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) @@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshapedOnlyRHS config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // 
Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, - config.export_cl_image); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) { GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; - std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) @@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigReshaped config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = 
mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); - std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs, - config.transpose_rhs, config.export_cl_image); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) @@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); - return GEMMConfigResult{ true, lhs_info, rhs_info }; + return GEMMConfigResult{true, lhs_info, rhs_info}; } GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) @@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) GEMMRHSMatrixInfo rhs_info; mlgo::GEMMConfigNative config{}; const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); - if(mlgo_heuristics != 
nullptr) + if (mlgo_heuristics != nullptr) { - std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b }); + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); } - if(valid) + if (valid) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter - std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); } else { ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); } - return GEMMConfigResult{ valid, lhs_info, rhs_info }; + return GEMMConfigResult{valid, lhs_info, rhs_info}; } } // namespace auto_heuristics } // namespace cl_gemm -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h index 020237b7f4..f544715e03 100644 --- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -50,8 +50,7 @@ struct CommonQuery /** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ struct GEMMTypeResult { - GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) - : valid{ valid }, gemm_type{ gemm_type } + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} { } /** Test if the result is valid */ @@ -67,7 +66,7 @@ struct GEMMTypeResult struct GEMMConfigResult { GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) - : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info } + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} { } /** Test if the result is valid */ @@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); } // namespace cl_gemm } // namespace arm_compute -#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
\ No newline at end of file +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h index c451bd9062..08a7ee8c18 100644 --- a/src/runtime/CL/mlgo/Common.h +++ b/src/runtime/CL/mlgo/Common.h @@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType; /** GEMM Configuration for Native kernel */ struct GEMMConfigNative { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** GEMM Configuration for Reshaped Only RHS kernel */ struct GEMMConfigReshapedOnlyRHS { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. 
n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; /** GEMM Configuration for Reshaped kernel */ struct GEMMConfigReshaped { - unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */ - unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ - unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */ - unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ - bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ - bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */ - bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. 
n0 must be equal to 4 */ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ }; } // namespace mlgo } // namespace arm_compute -#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
\ No newline at end of file +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp index 1c75cdc427..f7b706902b 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.cpp +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/HeuristicTree.h" + #include "arm_compute/core/Log.h" #include "support/Cast.h" @@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond) // PRE: all features and ConditionalOps are valid constexpr float eps = 0.0001f; // Calculate all secondary features - std::vector<std::pair<std::string, float>> cond_values - { - { "m", static_cast<float>(shape.m) }, - { "n", static_cast<float>(shape.n) }, - { "k", static_cast<float>(shape.k) }, - { "b", static_cast<float>(shape.b) }, - { "r_mn", static_cast<float>(shape.m) / shape.n }, - { "r_mk", static_cast<float>(shape.m) / shape.k }, - { "r_nk", static_cast<float>(shape.n) / shape.k }, - { "r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k) }, - { "workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0 } - }; - auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(), - [&cond](decltype(*cond_values.begin()) it) - { - return it.first == cond.feature; - }); + std::vector<std::pair<std::string, float>> cond_values{ + {"m", static_cast<float>(shape.m)}, + {"n", static_cast<float>(shape.n)}, + {"k", static_cast<float>(shape.k)}, + {"b", static_cast<float>(shape.b)}, + {"r_mn", static_cast<float>(shape.m) / shape.n}, + {"r_mk", static_cast<float>(shape.m) / shape.k}, + {"r_nk", static_cast<float>(shape.n) / shape.k}, + {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)}, + {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == 
cond.feature; }); ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); const float cond_value = cond_value_pair_it->second; - switch(cond.op) + switch (cond.op) { case ConditionalOp::LT: { @@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes; constexpr size_t HeuristicTree::_max_query_depth; constexpr HeuristicTree::NodeID HeuristicTree::_root; -HeuristicTree::HeuristicTree() - : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) { } HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) - : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{} + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} { } @@ -108,16 +104,17 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const // Root ID = 0; auto cur_node = _tree.at(_root).get(); size_t depth = 0; - while(cur_node->type() != NodeType::Leaf) + while (cur_node->type() != NodeType::Leaf) { - if(depth > _max_query_depth) + if (depth > _max_query_depth) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. 
Is the tree too deep?", + _max_query_depth); return std::make_pair(false, T{}); } ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); - if(evaluate(shape, br_node->condition)) + if (evaluate(shape, br_node->condition)) { cur_node = _tree.at(br_node->true_node).get(); } @@ -135,12 +132,12 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const template <typename T> bool HeuristicTree::add_leaf(NodeID id, T val) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val) bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) { - if(_tree.size() >= _max_num_nodes) + if (_tree.size() >= _max_num_nodes) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); return false; } - const std::set<std::string> supported_features = - { - "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload" - }; - const auto orig_feature = cond.feature; - std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c) - { - return std::tolower(c); - }); - if(supported_features.find(cond.feature) == supported_features.end()) + const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == 
supported_features.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); return false; } - if(_tree.find(id) != _tree.end()) + if (_tree.find(id) != _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); return false; @@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID bool HeuristicTree::check_if_structurally_correct() const { std::set<NodeID> visited; - std::deque<NodeID> to_visit{ _root }; + std::deque<NodeID> to_visit{_root}; - while(!to_visit.empty()) + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); - if(_tree.find(id) == _tree.end()) + if (_tree.find(id) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); return false; } auto not_seen_before = visited.insert(id); - if(!not_seen_before.second) + if (!not_seen_before.second) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); return false; } auto cur_node = _tree.at(id).get(); - if(cur_node->type() == NodeType::Branch) + if (cur_node->type() == NodeType::Branch) { auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); to_visit.push_back(br_node->true_node); to_visit.push_back(br_node->false_node); } } - if(visited.size() != _tree.size()) + if (visited.size() != _tree.size()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); return false; @@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const bool HeuristicTree::check() { - if(_tree.empty()) + if (_tree.empty()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered"); return false; } - if(_tree.find(_root) == _tree.end()) + if (_tree.find(_root) == _tree.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. 
Root must have a Node ID of %zu", _root); return false; @@ -237,7 +229,8 @@ template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shap /** Explicit template instantiation @relates HeuristicTree */ template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ -template std::pair<bool, GEMMConfigReshapedOnlyRHS> HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const; +template std::pair<bool, GEMMConfigReshapedOnlyRHS> +HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const; /** Explicit template instantiation @relates HeuristicTree */ template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const; diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h index d5c7de2215..a4f8c116b9 100644 --- a/src/runtime/CL/mlgo/HeuristicTree.h +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -25,6 +25,7 @@ #define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H #include "arm_compute/core/Types.h" + #include "src/runtime/CL/mlgo/Common.h" #include <map> @@ -84,7 +85,7 @@ public: struct BranchNode : public Node { BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) - : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node } + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} { } NodeType type() const override @@ -100,8 +101,7 @@ public: template <typename T> struct LeafNode : public Node { - LeafNode(NodeID id, T val) - : id{ id }, value{ val } + LeafNode(NodeID id, T val) : id{id}, value{val} { } NodeType type() const override @@ -177,22 +177,22 @@ public: bool check(); private: - static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query - static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree - static constexpr NodeID _root{ 0 }; // Root tree ID + static 
constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Root tree ID private: bool check_if_structurally_correct() const; private: - TreeID _id; /**< Heuristic tree ID */ - HeuristicType _heuristic_type; /**< Heuristic type */ - std::string _ip_target; /**< IP target associated with the tree */ - DataType _data_type; /**< Data type associated with the tree */ - std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */ + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp index 80f3bb85e9..aed46cd80f 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -24,6 +24,7 @@ #include "src/runtime/CL/mlgo/MLGOHeuristics.h" #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/MLGOParser.h" #include "src/runtime/CL/mlgo/Utils.h" @@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) } bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, - rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) { - return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, - rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); } constexpr size_t MLGOHeuristics::_max_num_trees; -MLGOHeuristics::MLGOHeuristics() - : _indices{}, _trees{}, _tree_valid{}, _valid{ false } +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} { } @@ -59,71 +60,74 @@ 
std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) co { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); const auto invalid = GEMMType::RESHAPED; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMType>(shape_query); } std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigNative{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. 
Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMConfigNative>(shape_query); } std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshapedOnlyRHS{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query); } std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. 
%s.", to_string(query).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); const auto invalid = GEMMConfigReshaped{}; - if(!_valid) + if (!_valid) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); - return { false, invalid }; + return {false, invalid}; } auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); - GEMMShape shape_query{ query.m, query.n, query.k, query.b }; - if(_trees.find(index) == _trees.end()) + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); - return { false, invalid }; + return {false, invalid}; } return _trees.at(index).query<GEMMConfigReshaped>(shape_query); } @@ -131,14 +135,14 @@ std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(c bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) { bool status; - HeuristicTree *tree{ nullptr }; + HeuristicTree *tree{nullptr}; std::tie(status, tree) = get_heuristic_tree(id); - if(!status) + if (!status) { return status; } status = tree->check(); - if(!status) + if (!status) { return status; } @@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) bool MLGOHeuristics::check_all() const { // Tree validities are already checked and cached. - bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) - { - return !v.second; - }) - == _tree_valid.end(); - if(!all_trees_are_checked) + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) { - ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. 
This could also indicate there are no trees in the dotmlgo"); + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. This could also indicate there are no trees in the dotmlgo"); return false; } @@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) { - if(_indices.find(id) == _indices.end()) + if (_indices.find(id) == _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); return std::make_pair(false, nullptr); } const auto index = _indices[id]; - if(_trees.find(index) == _trees.end()) + if (_trees.find(index) == _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); return std::make_pair(false, nullptr); @@ -186,7 +188,7 @@ std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTre bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) { - if(_indices.size() >= _max_num_trees) + if (_indices.size() >= _max_num_trees) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); return false; @@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // PRE: correctness of t is guaranteed by the tree construction process // Ensure unique id const auto id = t.id(); - if(_indices.find(id) != _indices.end()) + if (_indices.find(id) != _indices.end()) { ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); return false; @@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) // Ensure unique index const auto index = t.index(); - if(_trees.find(index) != _trees.end()) + if (_trees.find(index) != _trees.end()) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); return false; @@ -219,9 +221,10 @@ bool 
MLGOHeuristics::reload_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); return _valid = false; } return reload_from_stream(fs); @@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename) bool MLGOHeuristics::reload_from_stream(std::istream &in) { auto parsed = parser::parse_mlgo(in); - if(!parsed.first) + if (!parsed.first) { ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead"); return _valid = false; @@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in) } } // namespace mlgo -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h index aa21225959..6a491c5503 100644 --- a/src/runtime/CL/mlgo/MLGOHeuristics.h +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -135,16 +135,16 @@ public: bool check_all() const; private: - static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/ + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/ private: // There exists a one-to-one mappipng between TreeID and Index, either can be used to identify a @ref HeuristicTree std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */ std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */ std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */ - bool _valid; /**< Overall validity */ + bool _valid; /**< Overall validity */ }; } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp index 625739e450..893daf2ed9 100644 --- a/src/runtime/CL/mlgo/MLGOParser.cpp +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -22,19 +22,21 @@ * SOFTWARE. */ #include "src/runtime/CL/mlgo/MLGOParser.h" + #include "arm_compute/core/Log.h" + #include "src/runtime/CL/mlgo/Utils.h" #include <sstream> #define CHECK(parser_expr, valid_var) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return; #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ (parser_expr); \ - if(!valid_var) \ + if (!valid_var) \ return default_val; #ifdef ARM_COMPUTE_LOGGING_ENABLED @@ -53,8 +55,7 @@ valid_var = false; \ return default_val; -#define LOG_TOKEN_POS(tokens, pos_var) \ - const auto pos_var = tokens.current_pos(); +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); #else // ARM_COMPUTE_LOGGING_ENABLED @@ -73,19 +74,12 @@ namespace { void ltrim(std::string &str) { - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) - { - return !std::isspace(ch); - })); + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); } void rtrim(std::string &str) { - str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) - { - return !std::isspace(ch); - }).base(), - str.end()); + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); } void trim(std::string &str) @@ -109,7 +103,7 @@ enum class ComparatorType }; TokenStream::TokenStream(std::istream &s, const std::string &delims) - : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{} + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} { read(); } @@ -125,7 +119,7 @@ Token TokenStream::take() ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); Token t = _tokens.front(); 
_tokens.pop_front(); - if(_tokens.empty()) + if (_tokens.empty()) { read(); } @@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i) ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead"); // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end - while(_istream && _tokens.size() <= i) + while (_istream && _tokens.size() <= i) { read(); } @@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i) void advance(CharPosition &pos, char ch) { - if(ch == '\n') + if (ch == '\n') { pos.ln += 1; pos.col = 0; @@ -167,17 +161,16 @@ void TokenStream::read() do { // Reached eof - if(!_istream.get(ch)) + if (!_istream.get(ch)) { - if(!reached_end()) + if (!reached_end()) { _tokens.emplace_back(TokenType::End, "", _lookahead_pos); } return; } advance(_lookahead_pos, ch); - } - while(std::isspace(ch) || is_delim(ch)); + } while (std::isspace(ch) || is_delim(ch)); // Read chars until we hit a delim or eof auto orig_pos = _lookahead_pos; auto tok = recognize_tok(ch); @@ -190,41 +183,41 @@ void TokenStream::read() Token TokenStream::recognize_tok(char ch) { - if(ch == '[') + if (ch == '[') { - return Token{ TokenType::L_List, "", _lookahead_pos }; + return Token{TokenType::L_List, "", _lookahead_pos}; } - else if(ch == ']') + else if (ch == ']') { - return Token{ TokenType::R_List, "", _lookahead_pos }; + return Token{TokenType::R_List, "", _lookahead_pos}; } - else if(ch == '.') + else if (ch == '.') { - return float_after_dp_st(std::string{ ch }); + return float_after_dp_st(std::string{ch}); } - else if(std::isdigit(ch)) + else if (std::isdigit(ch)) { - return num_st(std::string{ ch }); + return num_st(std::string{ch}); } else { - return text_st(std::string{ ch }); + return text_st(std::string{ch}); } } Token TokenStream::num_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { 
advance(_lookahead_pos, ch); - if(ch == '.') + if (ch == '.') { return float_after_dp_st(value + ch); } - else if(!std::isdigit(ch)) + else if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value) } value += ch; } - return Token{ TokenType::Int, value, _lookahead_pos }; + return Token{TokenType::Int, value, _lookahead_pos}; } Token TokenStream::float_after_dp_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(!std::isdigit(ch)) + if (!std::isdigit(ch)) { - if(!is_delim(ch) && !std::isspace(ch)) + if (!is_delim(ch) && !std::isspace(ch)) { rewind(_lookahead_pos); _istream.unget(); @@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value) } value += ch; } - return Token{ TokenType::Float, value, _lookahead_pos }; + return Token{TokenType::Float, value, _lookahead_pos}; } Token TokenStream::text_st(std::string value) { char ch{}; - while(_istream.get(ch)) + while (_istream.get(ch)) { advance(_lookahead_pos, ch); - if(is_delim(ch)) + if (is_delim(ch)) { break; } - if(ch == '[' || ch == ']') + if (ch == '[' || ch == ']') { rewind(_lookahead_pos); _istream.unget(); @@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value) } value += ch; } - return Token{ TokenType::Text, value, _lookahead_pos }; + return Token{TokenType::Text, value, _lookahead_pos}; } bool TokenStream::reached_end() const @@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::End) + if (tok.type != TokenType::End) { FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); } @@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { 
FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); } @@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Int) + if (tok.type != TokenType::Int) { FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); } @@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); - if(val < 0) + if (val < 0) { FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); } @@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Float) + if (tok.type != TokenType::Float) { FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); } @@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); auto tok = in.take(); - if(tok.type != TokenType::Text || tok.value.empty()) + if (tok.type != TokenType::Text || tok.value.empty()) { FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); } @@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid) bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) { auto tok = in.peek(); - if(tok.type == TokenType::Text && tok.value == c_str) + if (tok.type == TokenType::Text && tok.value == c_str) { - if(take) + if (take) { in.take(); } @@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) void expect_text(TokenStream &in, const std::string &str, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_text(in, str)) + if (!accept_text(in, str)) { FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); } @@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid) bool accept_l_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::L_List) + if (tok.type == TokenType::L_List) { 
in.take(); return true; @@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in) void expect_l_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_l_list(in)) + if (!accept_l_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect '['"); } @@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid) bool accept_r_list(TokenStream &in) { auto tok = in.peek(); - if(tok.type == TokenType::R_List) + if (tok.type == TokenType::R_List) { in.take(); return true; @@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in) void expect_r_list(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(!accept_r_list(in)) + if (!accept_r_list(in)) { FAIL_WITH_MSG(valid, pos, "Expect ']'"); } @@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid) ConditionalOp conditional_op(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "<=")) + if (accept_text(in, "<=")) { return ConditionalOp::LE; } - else if(accept_text(in, ">=")) + else if (accept_text(in, ">=")) { return ConditionalOp::GE; } - else if(accept_text(in, "==")) + else if (accept_text(in, "==")) { return ConditionalOp::EQ; } - else if(accept_text(in, "<")) + else if (accept_text(in, "<")) { return ConditionalOp::LT; } - else if(accept_text(in, ">")) + else if (accept_text(in, ">")) { return ConditionalOp::GT; } @@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid) { CHECK(expect_text(in, "ip-type", valid), valid); LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gpu")) + if (accept_text(in, "gpu")) { ; } - else if(accept_text(in, "cpu")) + else if (accept_text(in, "cpu")) { ; } @@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid) DataType data_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "f16")) + if (accept_text(in, "f16")) { return DataType::F16; } - else if(accept_text(in, "f32")) + else if (accept_text(in, "f32")) { return DataType::F32; } - else if(accept_text(in, "qasymm8")) + else if (accept_text(in, 
"qasymm8")) { return DataType::QASYMM8; } @@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid) ComparatorType comparator_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "var")) + if (accept_text(in, "var")) { return ComparatorType::Var; } - else if(accept_text(in, "num")) + else if (accept_text(in, "num")) { return ComparatorType::Num; } - else if(accept_text(in, "enum")) + else if (accept_text(in, "enum")) { return ComparatorType::Enum; } @@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid) HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "gemm-type", take)) + if (accept_text(in, "gemm-type", take)) { return HeuristicType::GEMM_Type; } - else if(accept_text(in, "gemm-config-native", take)) + else if (accept_text(in, "gemm-config-native", take)) { return HeuristicType::GEMM_Config_Native; } - else if(accept_text(in, "gemm-config-reshaped-only-rhs", take)) + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) { return HeuristicType::GEMM_Config_Reshaped_Only_RHS; } - else if(accept_text(in, "gemm-config-reshaped", take)) + else if (accept_text(in, "gemm-config-reshaped", take)) { return HeuristicType::GEMM_Config_Reshaped; } @@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val { LOG_TOKEN_POS(in, pos); auto ht = CHECK(heuristic_type(in, valid, false), valid); - if(ht != expected_ht) + if (ht != expected_ht) { FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); } @@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val GEMMType gemm_type(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "native")) + if (accept_text(in, "native")) { return GEMMType::NATIVE; } - else if(accept_text(in, "reshaped-only-rhs")) + else if (accept_text(in, "reshaped-only-rhs")) { return 
GEMMType::RESHAPED_ONLY_RHS; } - else if(accept_text(in, "reshaped")) + else if (accept_text(in, "reshaped")) { return GEMMType::RESHAPED; } @@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigNative{ m0, n0, k0 }; + return GEMMConfigNative{m0, n0, k0}; } GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) @@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex }; + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; } GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) @@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); - return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex }; + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; } void gpu_priority(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "best-performance")) + if (accept_text(in, "best-performance")) { ; } - else if(accept_text(in, "best-memory-usage")) + else if (accept_text(in, "best-memory-usage")) { ; } @@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid) void gpu_behavior(TokenStream &in, bool &valid) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "static")) + if 
(accept_text(in, "static")) { ; } - else if(accept_text(in, "dynamic")) + else if (accept_text(in, "dynamic")) { ; } @@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid) void free_vars(TokenStream &in, bool &valid) { CHECK(expect_l_list(in, valid), valid); - while(!accept_r_list(in)) + while (!accept_r_list(in)) { CHECK(text_val(in, valid), valid); } @@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) { CHECK(expect_text(in, "<heuristics-table>", valid), valid); - while(!accept_text(in, "</heuristics-table>")) + while (!accept_text(in, "</heuristics-table>")) { CHECK(heuristics_table_entry(in, h, valid), valid); } @@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid) const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); - if(l_t != ComparatorType::Var || r_t != ComparatorType::Num) + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) { - FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); } - return Condition{ l_v, c_o, r_v }; + return Condition{l_v, c_o, r_v}; } void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) @@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) CHECK(expect_text(in, "<heuristic", valid), valid); const auto tree_id = CHECK(uint_val(in, valid), valid); CHECK(expect_text(in, ">", valid), valid); - HeuristicTree *t = nullptr; - std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = 
CHECK(h.get_heuristic_tree(tree_id), valid); const HeuristicType t_heuristic_type = std::get<0>(t->index()); - while(!accept_text(in, "</heuristic>")) + while (!accept_text(in, "</heuristic>")) { LOG_TOKEN_POS(in, pos); - if(accept_text(in, "b")) + if (accept_text(in, "b")) { // Branch node const auto id = CHECK(uint_val(in, valid), valid); @@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) const auto f_id = CHECK(uint_val(in, valid), valid); valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); } - else if(accept_text(in, "l")) + else if (accept_text(in, "l")) { // Leaf node const auto id = CHECK(uint_val(in, valid), valid); @@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) // heuristic table). For now it remains as a step for validation. LOG_TOKEN_POS(in, pos); CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); - switch(t_heuristic_type) + switch (t_heuristic_type) { case HeuristicType::GEMM_Type: { @@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid) MLGOHeuristics h; CHECK_DEFAULT(header(in, valid), valid, h); CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); - while(accept_text(in, "<heuristic", false)) + while (accept_text(in, "<heuristic", false)) { CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h); } @@ -809,4 +803,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in) #undef CHECK #undef CHECK_DEFAULT #undef FAIL_WITH_MSG -#undef FAIL_WITH_MSG_DEFAULT
\ No newline at end of file +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h index 49d8b9c644..cffce8d6a1 100644 --- a/src/runtime/CL/mlgo/MLGOParser.h +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -98,15 +98,14 @@ struct CharPosition return ln == other.ln && col == other.col; } - size_t ln{ 0 }; - size_t col{ 0 }; + size_t ln{0}; + size_t col{0}; }; /** Token */ struct Token { - Token(TokenType t, std::string v, CharPosition pos) - : type{ t }, value{ v }, pos{ pos } + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} { } @@ -196,4 +195,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in); } // namespace parser } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp index 81d418c28e..c7e0100b3c 100644 --- a/src/runtime/CL/mlgo/Utils.cpp +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -43,40 +43,38 @@ inline std::string to_str(const T &val) std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) { return os << "Native:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) { return os << "ReshapedOnlyRHS:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) { return os << "Reshaped:{" - << "m0: " << config.m0 << ", " - << "n0: " << config.n0 << ", " - << "k0: " << config.k0 << ", " - << "v0: " << config.v0 << ", " - << "h0: " << config.h0 << ", " - << "interleave_lhs: " << config.interleave_lhs << ", " - << "interleave_rhs: " << config.interleave_rhs << ", " - << "transpose_rhs: " << config.transpose_rhs << ", " - << "export_cl_image: " << config.export_cl_image - << "}"; + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + 
<< "h0: " << config.h0 << ", " + << "interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; } std::ostream &operator<<(std::ostream &os, HeuristicType ht) { - switch(ht) + switch (ht) { case HeuristicType::GEMM_Type: { @@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht) } std::ostream &operator<<(std::ostream &os, DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: { @@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos) } // namespace mlgo -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h index c634a887e9..73b537f476 100644 --- a/src/runtime/CL/mlgo/Utils.h +++ b/src/runtime/CL/mlgo/Utils.h @@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht); std::ostream &operator<<(std::ostream &os, DataType dt); std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); std::ostream &operator<<(std::ostream &os, const Query &query); -std::string to_string(const GEMMConfigNative &config); -std::string to_string(const GEMMConfigReshapedOnlyRHS &config); -std::string to_string(const GEMMConfigReshaped &config); -std::string to_string(const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); namespace parser { std::ostream &operator<<(std::ostream &os, const CharPosition &pos); @@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos); } // namespace mlgo } // namespace arm_compute -#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
\ No newline at end of file +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6f3e32491a..5e3907f1ea 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -27,20 +27,20 @@ namespace arm_compute { namespace cl_tuner { -constexpr unsigned int max_lws_supported_x{ 64u }; -constexpr unsigned int max_lws_supported_y{ 32u }; -constexpr unsigned int max_lws_supported_z{ 32u }; +constexpr unsigned int max_lws_supported_x{64u}; +constexpr unsigned int max_lws_supported_y{32u}; +constexpr unsigned int max_lws_supported_z{32u}; /** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: /* Shape of 4-D search space */ - TensorShape search_space_shape{ 0, 0, 0, 0 }; - std::vector<unsigned int> _lws_x{ 0 }; - std::vector<unsigned int> _lws_y{ 0 }; - std::vector<unsigned int> _lws_z{ 0 }; - std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + TensorShape search_space_shape{0, 0, 0, 0}; + std::vector<unsigned int> _lws_x{0}; + std::vector<unsigned int> _lws_y{0}; + std::vector<unsigned int> _lws_z{0}; + std::vector<int> _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units. The value is in the range [-31,+31]. When 0, the runtime-selected wbs used is unmodified. */ @@ -116,7 +116,8 @@ private: * @param[in] lws_max Max LWS value allowed to be tested * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one. 
*/ - void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); + void + initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one); }; /** A minimal subset of LWS values that only have 1,2 and 4/8 */ @@ -170,9 +171,9 @@ CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDR search_space_shape[1] = lws_y_max; search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + _wbsm = {-3, -2, -1, 0, 1, 2, 3}; search_space_shape[3] = _wbsm.size(); } } @@ -194,26 +195,31 @@ CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gw _lws_x = {}; _lws_y = {}; _lws_z = {}; - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_x, gws[0], lws_x_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 + initialize_lws_values(_lws_y, gws[1], lws_y_max, + gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); search_space_shape[0] = _lws_x.size(); search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -2, -1, 0, 1, 2 }; + _wbsm = {-2, -1, 0, 1, 2}; search_space_shape[3] = _wbsm.size(); } } -void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) +void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, + unsigned int gws, + unsigned int lws_max, + 
bool mod_let_one) { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; ++i) + for (unsigned int i = 2; i <= lws_max; ++i) { // Power of two condition const bool is_power_of_two = (i & (i - 1)) == 0; @@ -221,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in // Condition for the module accordingly with the mod_let_one flag const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0; - if(mod_cond || is_power_of_two) + if (mod_cond || is_power_of_two) { lws.push_back(i); } @@ -246,9 +252,9 @@ CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, search_space_shape[1] = _lws_y.size(); search_space_shape[2] = _lws_z.size(); search_space_shape[3] = 1; - if(tuning_info.tune_wbsm) + if (tuning_info.tune_wbsm) { - _wbsm = { -1, 0, 1 }; + _wbsm = {-1, 0, 1}; search_space_shape[3] = _wbsm.size(); } } @@ -257,7 +263,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int { lws.push_back(1); - for(unsigned int i = 2; i <= lws_max; i *= 4) + for (unsigned int i = 2; i <= lws_max; i *= 4) { lws.push_back(i); } @@ -265,7 +271,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(tuning_info.tuner_mode) + switch (tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info); |